From 2c1412e05c19345ad1a627e23aa6631b27ec1f66 Mon Sep 17 00:00:00 2001 From: Vasilev Date: Wed, 19 Jul 2023 11:09:35 +0000 Subject: [PATCH 01/49] changed logging --- lightautoml/automl/blend.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lightautoml/automl/blend.py b/lightautoml/automl/blend.py index f5bea128..d0c78615 100644 --- a/lightautoml/automl/blend.py +++ b/lightautoml/automl/blend.py @@ -361,11 +361,14 @@ def _optimize(self, splitted_preds: Sequence[NumpyDataset]) -> np.ndarray: length = len(splitted_preds) candidate = np.ones(length, dtype=np.float32) / length + pre_candidate = candidate best_pred = self._get_weighted_pred(splitted_preds, candidate) best_score = self.score(best_pred) logger.info("Blending: optimization starts with equal weights and score \x1b[1m{0}\x1b[0m".format(best_score)) score = best_score + iter_best_score = None + iter_best_weights = None for _ in range(self.max_iters): flg_no_upd = True for i in range(len(splitted_preds)): @@ -381,17 +384,21 @@ def _optimize(self, splitted_preds: Sequence[NumpyDataset]) -> np.ndarray: ) w = opt_res.x score = -opt_res.fun + pre_candidate = self._get_candidate(candidate, i, w) + if i == 0 or iter_best_score < score: + iter_best_score = score + iter_best_weights = pre_candidate if score > best_score: flg_no_upd = False best_score = score # if w < self.max_nonzero_coef: # w = 0 - candidate = self._get_candidate(candidate, i, w) + candidate = pre_candidate logger.info( "Blending: iteration \x1b[1m{0}\x1b[0m: score = \x1b[1m{1}\x1b[0m, weights = \x1b[1m{2}\x1b[0m".format( - _, score, candidate + _, iter_best_score, iter_best_weights ) ) From bb955756574ee62fbab5f658fcda49a0dd85ceb1 Mon Sep 17 00:00:00 2001 From: Vasilev Date: Wed, 19 Jul 2023 11:44:52 +0000 Subject: [PATCH 02/49] lint fix --- lightautoml/addons/autots/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lightautoml/addons/autots/base.py b/lightautoml/addons/autots/base.py index 
5cc8cea1..d84ac2ba 100644 --- a/lightautoml/addons/autots/base.py +++ b/lightautoml/addons/autots/base.py @@ -192,7 +192,8 @@ def fit_predict(self, train_data, roles, verbose=0): if hasattr(self.TM, "automl_trend"): self.datetime_step = ( - pd.to_datetime(train_data[self.datetime_key]).iloc[1] - pd.to_datetime(train_data[self.datetime_key]).iloc[0] + pd.to_datetime(train_data[self.datetime_key]).iloc[1] + - pd.to_datetime(train_data[self.datetime_key]).iloc[0] ) # fit main train_detrend = train_data.copy() From 3d76b96bdb80ae11a074fc1884145697b58388ad Mon Sep 17 00:00:00 2001 From: Vasilev Date: Wed, 19 Jul 2023 11:09:35 +0000 Subject: [PATCH 03/49] changed logging --- lightautoml/automl/blend.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lightautoml/automl/blend.py b/lightautoml/automl/blend.py index f5bea128..d0c78615 100644 --- a/lightautoml/automl/blend.py +++ b/lightautoml/automl/blend.py @@ -361,11 +361,14 @@ def _optimize(self, splitted_preds: Sequence[NumpyDataset]) -> np.ndarray: length = len(splitted_preds) candidate = np.ones(length, dtype=np.float32) / length + pre_candidate = candidate best_pred = self._get_weighted_pred(splitted_preds, candidate) best_score = self.score(best_pred) logger.info("Blending: optimization starts with equal weights and score \x1b[1m{0}\x1b[0m".format(best_score)) score = best_score + iter_best_score = None + iter_best_weights = None for _ in range(self.max_iters): flg_no_upd = True for i in range(len(splitted_preds)): @@ -381,17 +384,21 @@ def _optimize(self, splitted_preds: Sequence[NumpyDataset]) -> np.ndarray: ) w = opt_res.x score = -opt_res.fun + pre_candidate = self._get_candidate(candidate, i, w) + if i == 0 or iter_best_score < score: + iter_best_score = score + iter_best_weights = pre_candidate if score > best_score: flg_no_upd = False best_score = score # if w < self.max_nonzero_coef: # w = 0 - candidate = self._get_candidate(candidate, i, w) + candidate = pre_candidate 
logger.info( "Blending: iteration \x1b[1m{0}\x1b[0m: score = \x1b[1m{1}\x1b[0m, weights = \x1b[1m{2}\x1b[0m".format( - _, score, candidate + _, iter_best_score, iter_best_weights ) ) From 697ebfd013a681520e3f594008eb2d23a891738b Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 25 Jul 2023 10:25:13 +0000 Subject: [PATCH 04/49] Added timm cv-library, bug-fix in multilabel linear model, added softmax with clip to repair warning in log_loss from sklearn --- lightautoml/automl/presets/image_config.yml | 145 +++++++++++++++++- lightautoml/image/image.py | 78 ++++------ lightautoml/ml_algo/linear_sklearn.py | 5 +- .../ml_algo/torch_based/linear_model.py | 14 +- lightautoml/ml_algo/utils.py | 35 +++++ .../pipelines/features/image_pipeline.py | 4 +- lightautoml/tasks/common_metric.py | 6 +- lightautoml/transformers/image.py | 9 +- pyproject.toml | 10 +- 9 files changed, 232 insertions(+), 74 deletions(-) diff --git a/lightautoml/automl/presets/image_config.yml b/lightautoml/automl/presets/image_config.yml index 7d4d17dc..9f0ee138 100644 --- a/lightautoml/automl/presets/image_config.yml +++ b/lightautoml/automl/presets/image_config.yml @@ -167,7 +167,7 @@ cv_simple_features: autocv_features: # model name from effnet family - embed_model: 'efficientnet-b0' + embed_model: 'vit_base_patch16_224.augreg_in21k' weights_path: null # directory for save / load cache cache_dir: './cache_CV' @@ -175,6 +175,147 @@ autocv_features: device: 'cuda:0' n_jobs: 4 random_state: 42 - is_advprop: True batch_size: 128 verbose: True + + +nn_pipeline_params: + # use quantile transformer for numerical columns + use_qnt: false + # number of quantiles to be computed + n_quantiles: null + # maximum number of samples used to estimate the quantiles for computational efficiency + subsample: 1000000000 + # marginal distribution for the transformed data. 
The choices are 'uniform' or 'normal' + output_distribution: normal + # add noise with certain std to dataset before quantile transformation to make data more smooth + noise: 0.001 + # if number of quantiles is none then it equals dataset size / factor + qnt_factor: 30 + # use target encoding for categorical columns + use_te: false + # max number of categories to generate intersections + top_intersections: 5 + max_bin_count: 10 + # max depth of cat intersection + max_intersection_depth: 3 + # subsample to calc data statistics + te_subsample: null + # should we output sparse if ohe encoding was used during cat handling + sparse_ohe: auto + # switch to target encoding if high cardinality + auto_unique_co: 50 + # output encoded categories or embed idxs + output_categories: true + # cutoff if use target encoding in cat handling on multiclass task if number of classes is high + multiclass_te_co: 3 + + + +nn_params: + # Look for NN train params here. + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + model: denselight + # use model with custom embeddings + model_with_emb: false + # tune custom network + tuned: false + # fewf + optimization_search_space: null + # str in torch.nn loss functions or nn.Module or func with (y_pred, y_true) args + loss: null + loss_params: {} + # calculate loss on logits or on predictions of model for classification tasks + loss_on_logits: true + # clip gradient before loss backprop + clip_grad: false + clip_grad_params: {} + drop_rate: 0.1 + # add fc layer before model with certain dim + num_init_features: null + # activation function (str in torch.nn activation functions or custom nn.Module) + act_fun: ReLU + # add noise after dropout layer for more regularization + use_noise: false + # noise parameter + noise_std: 0.05 + # use BatchNorm + use_bn: true + # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] + hidden_size: [512, 512, 512] + # dim of intermediate fc is increased times 
this factor in ResnetModel layer + hid_factor: [2, 2] + # list of number of layers within each DenseModel block + block_config: [2, 2] + # portion of neuron to drop after DenseBlock + compression: 0.5 + # output dim of every DenseLayer + growth_size: 256 + # dim of intermediate fc is increased times this factor in DenseModel layer + bn_factor: 2 + # early stopping and scheduler use metric + stop_by_metric: false + random_state: 42 + # path to save model state + # if None: stay in memory (CPU) + path_to_save: null + # optimizer + opt: Adam + # params of optimizer + opt_params: { 'lr': 0.0003, 'weight_decay': 0 } + # scheduler + sch: ReduceLROnPlateau + # params of ReduceLROnPlateau scheduler + scheduler_params: { 'patience': 5, 'factor': 0.5, 'min_lr': 0.00001 } + # using snapshot ensembles + # https://arxiv.org/abs/1704.00109 + is_snap: false + # params of snapshots: + # k - number of best snapshots (in terms of loss) + # early_stopping - use early stopping + # patience - early_stopping patience + # swa - stochastic weight average - averaging of snapshots weights and replace base model + # https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/ for idea details (different implementation) + # use swa with disabled is_snap + snap_params: { 'k': 3, 'early_stopping': True, 'patience': 10, 'swa': True } + # init last linear layer: + # zeros for weights, mean value for bias in regression, inverse sigmoid mean for binary, argmax for multiclass + init_bias: true + # verbose and create snapshots inside one training epoch every k steps + verbose_inside: null + # verbose every k epochs + verbose: 1 + # show progress bar for each epoch during batchwise training + verbose_bar: false + n_epochs: 50 + input_bn: False + emb_dropout: 0.1 + emb_ratio: 3 + max_emb_size: 256 + use_cont: true + use_cat: true + use_text: false + #set cudnn backend + deterministic: true + # use DP for model training + # currently, must be set to FALSE value + multigpu: false + # device + device: 
cuda:0 + # use defualt dataset config or custom torch dataset + dataset: UniversalDataset + pin_memory: false + # training and inference batch size + bs: 512 + num_workers: 0 + + tuning_params: + # pretrain tuner on holdout set. True - fast/ False - accurate + # Ex. if you have 5-fold cv, validate tuner only on 1 fold + fit_on_holdout: True + # max tuning iter for lightgbm. Auto - depends on dataset + # smaller dataset gets more iters (int or 'auto') + max_tuning_iter: 25 + # max tuning time. Tuning time might be set lower during train by automl's timer, but cannot be higher + max_tuning_time: 3600 + freeze_defaults: False diff --git a/lightautoml/image/image.py b/lightautoml/image/image.py index 4950dc3c..44a0cfb2 100644 --- a/lightautoml/image/image.py +++ b/lightautoml/image/image.py @@ -22,20 +22,14 @@ try: - from albumentations import Compose - from albumentations import Normalize - from albumentations import Resize - from albumentations.pytorch import ToTensorV2 -except: - import warnings + import timm - warnings.warn("'albumentations' - package isn't installed") -try: - from efficientnet_pytorch import EfficientNet + from timm.data import resolve_data_config + from timm.data.transforms_factory import create_transform except: import warnings - warnings.warn("'efficientnet_pytorch' - package isn't installed") + warnings.warn("'timm' - package isn't installed") from joblib import Parallel from joblib import delayed @@ -171,39 +165,31 @@ def transform(self, samples: Sequence[str]) -> np.ndarray: return np.vstack(res) -class EffNetImageEmbedder(nn.Module): - """Class to compute EfficientNet embeddings.""" +class TimmModelEmbedder(nn.Module): + """Class to compute TimmModels embeddings.""" def __init__( self, - model_name: str = "efficientnet-b0", + model_name: str = "efficientnet_b0.ra_in1k", weights_path: Optional[str] = None, - is_advprop: bool = True, device=torch.device("cuda:0"), ): - """Pytorch module for image embeddings based on efficient-net model. 
+ """Pytorch module for image embeddings based on timm models. Args: model_name: Name of effnet model. weights_path: Path to saved weights. - is_advprop: Use adversarial training. device: Device to use. """ - super(EffNetImageEmbedder, self).__init__() + super(TimmModelEmbedder, self).__init__() self.device = device self.model = ( - EfficientNet.from_pretrained( - model_name, - weights_path=weights_path, - advprop=is_advprop, - include_top=False, - ) + timm.create_model(model_name, pretrained=True, num_classes=0, checkpoint_path=weights_path) .eval() .to(self.device) ) self.feature_shape = self.get_shape() - self.is_advprop = is_advprop self.model_name = model_name @torch.no_grad() @@ -219,85 +205,77 @@ def get_shape(self) -> int: def forward(self, x) -> torch.Tensor: """Forward pass.""" out = self.model(x) - return out[:, :, 0, 0] + return out -class ImageDataset: - """Image Dataset Class.""" +class ImageTimmDataset: + """Image for Timm Dataset Class.""" def __init__( self, + model: TimmModelEmbedder, data: Sequence[str], - is_advprop: bool = True, loader: Callable = pil_loader, ): - """Pytorch Dataset for :class:`~lightautoml.image.EffNetImageEmbedder`. + """Pytorch Dataset for :class:`~lightautoml.image.TimmModelEmbedder`. Args: + model: model which we train. data: Sequence of paths. - is_advprop: Use adversarial training. loader: Callable for reading image from path. 
""" self.X = data - self.transforms = Compose( - [ - Resize(224, 224), - Normalize([0.5] * 3, [0.5] * 3) if is_advprop else Normalize(), - ToTensorV2(), - ] - ) + self.transforms = create_transform(**resolve_data_config(model.model.pretrained_cfg, model=model.model)) self.loader = loader def __getitem__(self, idx: int) -> np.ndarray: path = self.X[idx] - img = np.array(self.loader(path)) - img = self.transforms(image=img)["image"] + img = self.loader(path) + img = self.transforms(img) return img def __len__(self): return len(self.X) -class DeepImageEmbedder(TransformerMixin): - """Transformer for image embeddings.""" +class DeepTimmImageEmbedder(TransformerMixin): + """Timm Transformer for image embeddings.""" def __init__( self, device: torch.device = torch.device("cuda:0"), n_jobs=4, random_state=42, - is_advprop=True, - model_name="efficientnet-b0", + model_name="efficientnet_b0.ra_in1k", weights_path: Optional[str] = None, batch_size: int = 128, verbose: bool = True, ): - """Pytorch Dataset for :class:`~lightautoml.image.EffNetImageEmbedder`. + """Pytorch Dataset for :class:`~lightautoml.image.TimmModelEmbedder`. Args: device: Torch device. n_jobs: Number of threads for dataloader. random_state: Random seed. - is_advprop: Use adversarial training. model_name: Name of effnet model. weights_path: Path to saved weights. batch_size: Batch size. verbose: Verbose data processing. 
""" - super(DeepImageEmbedder, self).__init__() - assert model_name in {f"efficientnet-b{i}" for i in range(8)} + super(DeepTimmImageEmbedder, self).__init__() + # add assert to check model + # assert model_name in {f"efficientnet-b{i}" for i in range(8)} self.device, self.device_ids = parse_devices(device) self.random_state = random_state self.n_jobs = n_jobs - self.is_advprop = is_advprop self.batch_size = batch_size self.verbose = verbose seed_everything(random_state) - self.model = EffNetImageEmbedder(model_name, weights_path, self.is_advprop, self.device) + self.model = TimmModelEmbedder(model_name, weights_path, self.device) def fit(self, data: Any = None): """Train model.""" @@ -314,7 +292,7 @@ def transform(self, data: Sequence[str]) -> np.ndarray: Array of embeddings. """ - data = ImageDataset(data, self.is_advprop) + data = ImageTimmDataset(self.model, data) loader = DataLoader(data, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs) result = [] diff --git a/lightautoml/ml_algo/linear_sklearn.py b/lightautoml/ml_algo/linear_sklearn.py index a4737b1e..d6388298 100644 --- a/lightautoml/ml_algo/linear_sklearn.py +++ b/lightautoml/ml_algo/linear_sklearn.py @@ -83,7 +83,10 @@ def _infer_params(self) -> TorchBasedLinearEstimator: params = copy(self.params) params["loss"] = self.task.losses["torch"].loss params["metric"] = self.task.losses["torch"].metric_func - if self.task.name in ["binary", "multiclass", "multilabel"]: + + if self.task.name in ["multilabel"]: + model = TorchBasedLogisticRegression(output_size=self.n_classes, multilabel=True, **params) + elif self.task.name in ["binary", "multiclass"]: model = TorchBasedLogisticRegression(output_size=self.n_classes, **params) elif self.task.name == "reg": model = TorchBasedLinearRegression(output_size=1, **params) diff --git a/lightautoml/ml_algo/torch_based/linear_model.py b/lightautoml/ml_algo/torch_based/linear_model.py index 02c6d5a5..6321caf5 100644 --- 
a/lightautoml/ml_algo/torch_based/linear_model.py +++ b/lightautoml/ml_algo/torch_based/linear_model.py @@ -16,6 +16,7 @@ from torch import optim from ...tasks.losses import TorchLossWrapper +from ..utils import MySoftmaxClip logger = logging.getLogger(__name__) @@ -137,7 +138,7 @@ class CatMulticlass(CatLinear): def __init__(self, numeric_size: int, embed_sizes: Sequence[int] = (), output_size: int = 1): super().__init__(numeric_size, embed_sizes=embed_sizes, output_size=output_size) - self.final_act = nn.Softmax(dim=1) + self.final_act = MySoftmaxClip(dim=1) class TorchBasedLinearEstimator: @@ -438,6 +439,7 @@ class TorchBasedLogisticRegression(TorchBasedLinearEstimator): embed_sizes: categorical embedding sizes. output_size: size of output layer. cs: regularization coefficients. + multilabel: multilabel or not. max_iter: maximum iterations of L-BFGS. tol: the tolerance for the stopping criteria. early_stopping: maximum rounds without improving. @@ -470,21 +472,25 @@ def __init__( 10.0, 20.0, ), + multilabel: bool = False, max_iter: int = 1000, tol: float = 1e-4, early_stopping: int = 2, loss=Optional[Callable], metric=Optional[Callable], ): - if output_size == 1: - _loss = nn.BCELoss + if multilabel: + _loss = nn.BCEWithLogitsLoss + _model = CatLogisticRegression + self._binary = False + elif output_size == 1: + _loss = nn.BCEWithLogitsLoss _model = CatLogisticRegression self._binary = True else: _loss = nn.CrossEntropyLoss _model = CatMulticlass self._binary = False - if loss is None: loss = TorchLossWrapper(_loss) diff --git a/lightautoml/ml_algo/utils.py b/lightautoml/ml_algo/utils.py index 75a24108..3cbbd5e1 100644 --- a/lightautoml/ml_algo/utils.py +++ b/lightautoml/ml_algo/utils.py @@ -6,6 +6,11 @@ from typing import Optional from typing import Tuple +import torch.nn as nn + +from torch import Tensor +from torch import finfo + from ..dataset.base import LAMLDataset from ..validation.base import TrainValidIterator from .base import MLAlgo @@ -76,3 +81,33 @@ 
def tune_and_fit_predict( return None, None return ml_algo, preds + + +class MySoftmaxClip(nn.Module): + """Softmax with clip-norm. + + Args: + dim : A dimension along which Softmax will be computed (so every slice + along dim will sum to 1). + """ + + def __init__(self, dim: Optional[int] = None) -> None: + super(MySoftmaxClip, self).__init__() + self.dim = dim + self.smax = nn.Softmax(dim=dim) + + def forward(self, inputs: Tensor) -> Tensor: + """Inference phase. + + Args: + inputs: data to softmax and clip. + + Returns: + transformed values. + + """ + inputs = self.smax(inputs) + eps = 2 * finfo(inputs.dtype).eps + inputs = inputs.clip(eps, 1 - eps) + inputs /= inputs.sum(dim=self.dim)[:, None] + return inputs diff --git a/lightautoml/pipelines/features/image_pipeline.py b/lightautoml/pipelines/features/image_pipeline.py index c8d2ca29..f72a515e 100644 --- a/lightautoml/pipelines/features/image_pipeline.py +++ b/lightautoml/pipelines/features/image_pipeline.py @@ -33,12 +33,11 @@ def __init__(self, **kwargs: Any): self.n_jobs = 4 self.loader = pil_loader - self.embed_model = "efficientnet-b0" + self.embed_model = "efficientnet_b0.ra_in1k" self.weights_path = None self.subs = 10000 self.cache_dir = "../cache_CV" self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") - self.is_advprop = True self.batch_size = 128 self.verbose = True @@ -108,7 +107,6 @@ def create_pipeline(self, train: LAMLDataset) -> LAMLTransformer: self.device, self.n_jobs, self.random_state, - self.is_advprop, self.batch_size, self.verbose, ), diff --git a/lightautoml/tasks/common_metric.py b/lightautoml/tasks/common_metric.py index 8cab338b..65bc6dee 100644 --- a/lightautoml/tasks/common_metric.py +++ b/lightautoml/tasks/common_metric.py @@ -315,7 +315,7 @@ def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, sample_weight: Option _valid_str_binary_metric_names = { "auc": roc_auc_score, - "logloss": partial(log_loss, eps=1e-7), + "logloss": 
partial(log_loss), "accuracy": BestClassBinaryWrapper(accuracy_score), } @@ -333,7 +333,7 @@ def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, sample_weight: Option _valid_str_multiclass_metric_names = { "auc_mu": auc_mu, "auc": roc_auc_ovr, - "crossentropy": partial(log_loss, eps=1e-7), + "crossentropy": partial(log_loss), "accuracy": BestClassMulticlassWrapper(accuracy_score), "f1_macro": BestClassMulticlassWrapper(F1Factory("macro")), "f1_micro": BestClassMulticlassWrapper(F1Factory("micro")), @@ -341,7 +341,7 @@ def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, sample_weight: Option } _valid_str_multireg_metric_names = {"mse": mean_squared_error, "mae": mean_absolute_error} -_valid_str_multilabel_metric_names = {"logloss": partial(log_loss, eps=1e-7)} +_valid_str_multilabel_metric_names = {"logloss": partial(log_loss)} _valid_str_metric_names = { "binary": _valid_str_binary_metric_names, diff --git a/lightautoml/transformers/image.py b/lightautoml/transformers/image.py index 51985453..9d434ff2 100644 --- a/lightautoml/transformers/image.py +++ b/lightautoml/transformers/image.py @@ -19,7 +19,7 @@ from ..dataset.np_pd_dataset import PandasDataset from ..dataset.roles import NumericRole from ..image.image import CreateImageFeatures -from ..image.image import DeepImageEmbedder +from ..image.image import DeepTimmImageEmbedder from ..image.utils import pil_loader from ..text.utils import get_textarr_hash from ..text.utils import single_text_hash @@ -154,7 +154,6 @@ class AutoCVWrap(LAMLTransformer): device: Torch device. n_jobs: Number of threads for dataloader. random_state: Random state to take subsample and set torch seed. - is_advprop: Use adversarial training. batch_size: Batch size for embedding model. verbose: Verbose data processing. 
@@ -177,14 +176,13 @@ def features(self) -> List[str]: def __init__( self, - model="efficientnet-b0", + model="efficientnet_b0.ra_in1k", weights_path: Optional[str] = None, cache_dir: str = "./cache_CV", subs: Optional[Any] = None, device: torch.device = torch.device("cuda:0"), n_jobs: int = 4, random_state: int = 42, - is_advprop: bool = True, batch_size: int = 128, verbose: bool = True, ): @@ -194,11 +192,10 @@ def __init__( self.dicts = {} self.cache_dir = cache_dir - self.transformer = DeepImageEmbedder( + self.transformer = DeepTimmImageEmbedder( device, n_jobs, random_state, - is_advprop, model, weights_path, batch_size, diff --git a/pyproject.toml b/pyproject.toml index 15337165..85306808 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ catboost = ">=0.26.1" optuna = "*" torch = [ {platform = "win32", python = "3.6.1", version = "1.7.0"}, - {version = ">=1.0.0"} + {version = "<=2.0.0"} ] dataclasses = {version = "0.6", python = "<3.7"} holidays = "*" @@ -72,16 +72,16 @@ transformers = {version = ">=4", optional = true} # CV albumentations = {version = "<=1.0.3", optional = true} -efficientnet-pytorch = {version = "*", optional = true} +timm = {version = "*", optional = true} opencv-python = {version = "<=4.5.2.52", optional = true} PyWavelets = {version = "*", optional = true} torchvision = [ {platform = "win32", python = "3.6.1", version = "0.8.0", optional = true}, - {platform = "*", version = "*", optional = true} + {platform = "*", version = "<=0.14.0", optional = true} ] # AFG -featuretools = {version = ">=1.11.1", python = ">=3.7", optional = true} +featuretools = {version = ">=1.11.1", python = ">=3.8", optional = true} # Report (pdf) weasyprint = {version = "52.5", optional = true} @@ -91,7 +91,7 @@ cffi = {version = "1.14.5", optional = true} [tool.poetry.extras] cv = [ "albumentations", - "efficientnet-pytorch", + "timm", "opencv-python", "PyWavelets", "scikit-image", From a9d14660b143e67d1d8e1fb6c5bd63c026830a9f Mon Sep 17 
00:00:00 2001 From: Vasilev Dmitriy Date: Thu, 3 Aug 2023 12:09:35 +0000 Subject: [PATCH 05/49] Added NODE neural network, added NODE example with tunning params --- .../Tutorial_9_neural_networks.ipynb | 186 +++++- lightautoml/automl/presets/tabular_presets.py | 12 +- lightautoml/ml_algo/dl_model.py | 2 + lightautoml/ml_algo/torch_based/nn_models.py | 69 ++- .../ml_algo/torch_based/node_nn_model.py | 531 ++++++++++++++++++ 5 files changed, 791 insertions(+), 9 deletions(-) create mode 100644 lightautoml/ml_algo/torch_based/node_nn_model.py diff --git a/examples/tutorials/Tutorial_9_neural_networks.ipynb b/examples/tutorials/Tutorial_9_neural_networks.ipynb index 0ee679d5..4224e3d3 100644 --- a/examples/tutorials/Tutorial_9_neural_networks.ipynb +++ b/examples/tutorials/Tutorial_9_neural_networks.ipynb @@ -118,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "2bea2ba9", "metadata": { "execution": { @@ -137,7 +137,40 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dvladimirvasilyev/anaconda3/envs/myenv/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'gensim' isn't installed. 
Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dvladimirvasilyev/LightAutoML/lightautoml/ml_algo/dl_model.py:41: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n", + "/home/dvladimirvasilyev/LightAutoML/lightautoml/text/nn_model.py:22: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n", + "/home/dvladimirvasilyev/LightAutoML/lightautoml/text/dl_transformers.py:25: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n" + ] + } + ], "source": [ "# Standard python libraries\n", "import os\n", @@ -187,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "64dfd5d0", "metadata": { "execution": { @@ -230,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "b8c3218d", "metadata": {}, "outputs": [], @@ -318,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "fc3bd7a7", "metadata": { "execution": { @@ -710,7 +743,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "id": "343d7bac", "metadata": {}, "outputs": [], @@ -1507,6 +1540,145 @@ "automl.fit_predict(tr_data, roles = roles, verbose = 3)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1000351d", + "metadata": {}, + "source": [ + "##### 4.2.3 One more example\n", + "##### Tuning NODE params" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fcbad7ce", 
+ "metadata": {}, + "outputs": [], + "source": [ + "TIMEOUT = 3000" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a3bba8dc", + "metadata": {}, + "outputs": [], + "source": [ + "default_lama_params = {\n", + " \"task\": task, \n", + " \"timeout\": TIMEOUT,\n", + " \"cpu_limit\": N_THREADS,\n", + " \"reader_params\": {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}\n", + "}\n", + "\n", + "default_nn_params = {\n", + " \"bs\": 512, \"num_workers\": 0, \"path_to_save\": None, \"n_epochs\": 10, \"freeze_defaults\": True\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ec77132c", + "metadata": {}, + "outputs": [], + "source": [ + "def my_opt_space_NODE(trial: optuna.trial.Trial, estimated_n_trials, suggested_params):\n", + " ''' \n", + " This fucntion needs for paramer tuning\n", + " '''\n", + " # optionally\n", + " trial_values = copy(suggested_params)\n", + "\n", + " trial_values[\"layer_dim\"] = trial.suggest_categorical(\n", + " \"layer_dim\", [2 ** i for i in range(8, 10)]\n", + " )\n", + " trial_values[\"use_original_head\"] = trial.suggest_categorical(\n", + " \"use_original_head\", [True, False]\n", + " )\n", + " trial_values[\"num_layers\"] = trial.suggest_int(\n", + " \"num_layers\", 1, 3\n", + " )\n", + " trial_values[\"drop_rate\"] = trial.suggest_float(\n", + " \"drop_rate\", 0.0, 0.3\n", + " )\n", + " trial_values[\"tree_dim\"] = trial.suggest_int(\n", + " \"tree_dim\", 1, 3\n", + " )\n", + " return trial_values" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ba312d42", + "metadata": {}, + "outputs": [], + "source": [ + "automl = TabularAutoML(\n", + " task = task, \n", + " timeout = TIMEOUT,\n", + " cpu_limit = N_THREADS,\n", + " general_params = {\"use_algos\": [[\"node_tuned\"]]}, # ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model\n", + " nn_params = {\"n_epochs\": 10, \"bs\": 512, \"num_workers\": 0, \"path_to_save\": None, 
\"freeze_defaults\": True, \"optimization_search_space\": my_opt_space_NODE,},\n", + " nn_pipeline_params = {\"use_qnt\": True, \"use_te\": False},\n", + " reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "3df2104f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[11:58:03] Stdout logging level is INFO2.\n", + "[11:58:03] Task: binary\n", + "\n", + "[11:58:03] Start automl preset with listed constraints:\n", + "[11:58:03] - time: 3000.00 seconds\n", + "[11:58:03] - CPU: 4 cores\n", + "[11:58:03] - memory: 16 GB\n", + "\n", + "[11:58:03] \u001b[1mTrain data shape: (8000, 122)\u001b[0m\n", + "\n", + "[11:58:03] Layer \u001b[1m1\u001b[0m train process start. Time left 2999.19 secs\n", + "[11:58:04] Start hyperparameters optimization for \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m ... Time budget is 1574.27 secs\n", + "[12:01:57] Hyperparameters optimization for \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m completed\n", + "[12:01:57] The set of hyperparameters \u001b[1m{'num_workers': 0, 'pin_memory': False, 'max_length': 256, 'is_snap': False, 'input_bn': False, 'max_emb_size': 256, 'bert_name': None, 'pooling': 'cls', 'device': ['0'], 'use_cont': True, 'use_cat': True, 'use_text': False, 'lang': 'en', 'deterministic': True, 'multigpu': False, 'random_state': 42, 'model': 'node', 'model_with_emb': False, 'path_to_save': None, 'verbose_inside': None, 'verbose': 1, 'n_epochs': 10, 'snap_params': {'k': 3, 'early_stopping': True, 'patience': 10, 'swa': True}, 'bs': 512, 'emb_dropout': 0.1, 'emb_ratio': 3, 'opt': 'Adam', 'opt_params': {'lr': 0.0003, 'weight_decay': 0}, 'sch': 'ReduceLROnPlateau', 'scheduler_params': {'patience': 5, 'factor': 0.5, 'min_lr': 1e-05}, 'loss': None, 'loss_params': {}, 'loss_on_logits': True, 'clip_grad': False, 'clip_grad_params': {}, 
'init_bias': True, 'dataset': 'UniversalDataset', 'tuned': False, 'optimization_search_space': , 'verbose_bar': False, 'freeze_defaults': True, 'n_out': None, 'hid_factor': [2, 2], 'hidden_size': [512, 512, 512], 'block_config': [2, 2], 'compression': 0.5, 'growth_size': 256, 'bn_factor': 2, 'drop_rate': 0.12034524690886754, 'noise_std': 0.05, 'num_init_features': None, 'act_fun': 'ReLU', 'use_noise': False, 'use_bn': True, 'stop_by_metric': False, 'tuning_params': {'fit_on_holdout': True, 'max_tuning_iter': 25, 'max_tuning_time': 3600}, 'layer_dim': 512, 'use_original_head': False, 'num_layers': 3, 'tree_dim': 2}\u001b[0m\n", + " achieve 0.7432 auc\n", + "[12:01:57] Start fitting \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m ...\n", + "[12:01:57] ===== Start working with \u001b[1mfold 0\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m =====\n", + "[12:02:09] ===== Start working with \u001b[1mfold 1\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m =====\n", + "[12:02:22] ===== Start working with \u001b[1mfold 2\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m =====\n", + "[12:02:34] ===== Start working with \u001b[1mfold 3\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m =====\n", + "[12:02:47] ===== Start working with \u001b[1mfold 4\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m =====\n", + "[12:02:59] Fitting \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m finished. 
score = \u001b[1m0.7146780211829931\u001b[0m\n", + "[12:02:59] \u001b[1mLvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0\u001b[0m fitting and predicting completed\n", + "[12:02:59] Time left 2703.40 secs\n", + "\n", + "[12:02:59] \u001b[1mLayer 1 training completed.\u001b[0m\n", + "\n", + "[12:02:59] \u001b[1mAutoml preset training completed in 296.61 seconds\u001b[0m\n", + "\n", + "[12:02:59] Model description:\n", + "Final prediction for new objects (level 0) = \n", + "\t 1.00000 * (5 averaged models Lvl_0_Pipe_0_Mod_0_Tuned_TorchNN_node_tuned_0) \n", + "\n" + ] + } + ], + "source": [ + "oof_pred = automl.fit_predict(tr_data, roles = roles, verbose = 2)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -1689,7 +1861,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.17" }, "papermill": { "default_parameters": {}, diff --git a/lightautoml/automl/presets/tabular_presets.py b/lightautoml/automl/presets/tabular_presets.py index 8d208c39..7ab512e0 100644 --- a/lightautoml/automl/presets/tabular_presets.py +++ b/lightautoml/automl/presets/tabular_presets.py @@ -594,7 +594,17 @@ def create_automl(self, **fit_args): selector = pre_selector lvl.append(self.get_gbms(gbm_models, n + 1, selector)) - available_nn_models = ["nn", "mlp", "dense", "denselight", "resnet", "snn", "linear_layer", "_linear_layer"] + available_nn_models = [ + "nn", + "mlp", + "dense", + "denselight", + "resnet", + "snn", + "linear_layer", + "_linear_layer", + "node", + ] available_nn_models = available_nn_models + [x + "_tuned" for x in available_nn_models] nn_models = [ x for x in names if x in available_nn_models or (isinstance(x, type) and issubclass(x, nn.Module)) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index ac1dae5d..8db9d7db 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -56,6 +56,7 @@ from ..text.utils import parse_devices from 
..text.utils import seed_everything from .torch_based.nn_models import MLP +from .torch_based.nn_models import NODE from .torch_based.nn_models import SNN from .torch_based.nn_models import DenseLightModel from .torch_based.nn_models import DenseModel @@ -74,6 +75,7 @@ "linear_layer": LinearLayer, "_linear_layer": _LinearLayer, "snn": SNN, + "node": NODE, } diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index dbd42ec6..119e0779 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -9,6 +9,9 @@ import torch import torch.nn as nn +from lightautoml.ml_algo.torch_based.node_nn_model import DenseODSTBlock +from lightautoml.ml_algo.torch_based.node_nn_model import Lambda + class GaussianNoise(nn.Module): """Adds gaussian noise. @@ -389,7 +392,6 @@ class DenseModel(nn.Module): bn_factor: Dim of intermediate fc is increased times `bn_factor` in DenseModel layer. act_fun: Activation function. use_bn: Use BatchNorm. - """ def __init__( @@ -729,3 +731,68 @@ def __init__(self): def forward(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: """Forward-pass.""" return x + + +class NODE(nn.Module): + """The NODE model from https://github.com/Qwicen. + + Args: + n_in: Input dim. + n_out: Output dim. + layer_dim: num trees in one layer. + num_layers: number of forests. + tree_dim: number of response channels in the response of individual tree. + use_original_head: use averaging as a head or put linear layer instead. + depth: number of splits in every tree. + drop_rate: Dropout rate for each layer altogether. + act_fun: Activation function. + num_init_features: If not none add fc layer before model with certain dim. + use_bn: Use BatchNorm. 
+ """ + + def __init__( + self, + n_in: int, + n_out: int = 1, + layer_dim: int = 2048, + num_layers: int = 1, + tree_dim: int = 1, + use_original_head: bool = False, + depth: int = 6, + drop_rate: float = 0.0, + act_fun: nn.Module = nn.ReLU, + num_init_features: Optional[int] = None, + use_bn: bool = True, + **kwargs, + ): + super(NODE, self).__init__() + num_features = n_in if num_init_features is None else num_init_features + self.dense0 = nn.Linear(n_in, num_features) if num_init_features is not None else nn.Identity() + self.features1 = nn.Sequential(OrderedDict([])) + block = DenseODSTBlock( + input_dim=num_features, + layer_dim=layer_dim, + num_layers=num_layers, + tree_dim=tree_dim if not use_original_head else n_out, + depth=depth, + input_dropout=drop_rate, + flatten_output=not use_original_head, + ) + self.features1.add_module("ODSTForestblock%d", block) + self.features2 = nn.Sequential(OrderedDict([])) + if use_original_head: + last_layer = Lambda(lambda x: x[..., :n_out].mean(dim=-2)) + self.features2.add_module("head", last_layer) + else: + if use_bn: + self.features2.add_module("norm", nn.BatchNorm1d(layer_dim * num_layers * tree_dim)) + self.features2.add_module("act", act_fun()) + fc = nn.Linear(layer_dim * num_layers * tree_dim, n_out) + self.features2.add_module("fc", fc) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward-pass.""" + x = self.dense0(x) + x = self.features1(x) + x = self.features2(x) + return x.view(x.shape[0], -1) diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py new file mode 100644 index 00000000..cdfedbea --- /dev/null +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -0,0 +1,531 @@ +"""Node utils models.""" + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch.autograd import Function +from torch.jit import script + + +def check_numpy(x): + """Makes sure x is a numpy array. 
+ + Args: + x : array to check. + + Returns: + x + """ + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + x = np.asarray(x) + assert isinstance(x, np.ndarray) + return x + + +def to_one_hot(y, depth=None): + """Takes integer with n dims and converts it to 1-hot representation with n + 1 dims. + + The n+1'st dimension will have zeros everywhere but at y'th index, where it will be equal to 1. + + Args: + y : input integer (IntTensor, LongTensor or Variable) of any shape + depth : the size of the one hot dimension + + Returns: + one hot Tensor + """ + y_flat = y.to(torch.int64).view(-1, 1) + depth = depth if depth is not None else int(torch.max(y_flat)) + 1 + y_one_hot = torch.zeros(y_flat.size()[0], depth, device=y.device).scatter_(1, y_flat, 1) + y_one_hot = y_one_hot.view(*(tuple(y.shape) + (-1,))) + return y_one_hot + + +def _make_ix_like(input, dim=0): + d = input.size(dim) + rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) + view = [1] * input.dim() + view[0] = -1 + return rho.view(view).transpose(0, dim) + + +class SparsemaxFunction(Function): + """An implementation of sparsemax (Martins & Astudillo, 2016). + + See :cite:`DBLP:journals/corr/MartinsA16` for detailed description. + By Ben Peters and Vlad Niculae + """ + + @staticmethod + def forward(ctx, input, dim=-1): + """sparsemax: normalizing sparse transform (a la softmax). + + Args: + ctx: context, to increase the speed + input (Tensor): any shape + dim: dimension along which to apply sparsemax + + Returns: + Tensor same shape as input + """ + ctx.dim = dim + max_val, _ = input.max(dim=dim, keepdim=True) + input -= max_val # same numerical stability trick as for softmax + tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) + output = torch.clamp(input - tau, min=0) + ctx.save_for_backward(supp_size, output) + return output + + @staticmethod + def backward(ctx, grad_output): + """backward-pass. 
+ + Args: + ctx: context, to increase the speed + grad_output: grad from the next layers + + Returns: + grad output + """ + supp_size, output = ctx.saved_tensors + dim = ctx.dim + grad_input = grad_output.clone() + grad_input[output == 0] = 0 + + v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() + v_hat = v_hat.unsqueeze(dim) + grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) + return grad_input, None + + @staticmethod + def _threshold_and_support(input, dim=-1): + """Sparsemax building block compute the threshold. + + Args: + input: any dimension + dim: dimension along which to apply the sparsemax + + Returns: + the threshold value + """ + input_srt, _ = torch.sort(input, descending=True, dim=dim) + input_cumsum = input_srt.cumsum(dim) - 1 + rhos = _make_ix_like(input, dim) + support = rhos * input_srt > input_cumsum + + support_size = support.sum(dim=dim).unsqueeze(dim) + tau = input_cumsum.gather(dim, support_size - 1) + tau /= support_size.to(input.dtype) + return tau, support_size + + +sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) # noqa: E731 +sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) # noqa: E731 + + +class Entmax15Function(Function): + """An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). + + See :cite:`https://arxiv.org/abs/1905.05702 for detailed description. + Source: https://github.com/deep-spin/entmax + """ + + @staticmethod + def forward(ctx, input, dim=-1): + """Entmax: normalizing sparse transform (a la softmax). 
+ + Args: + ctx: context, to increase the speed + input (Tensor): any shape + dim: dimension along which to apply Entmax + + Returns: + output (Tensor): same shape as input + """ + ctx.dim = dim + max_val, _ = input.max(dim=dim, keepdim=True) + input = input - max_val # same numerical stability trick as for softmax + input = input / 2 # divide by 2 to solve actual Entmax + + tau_star, _ = Entmax15Function._threshold_and_support(input, dim) + output = torch.clamp(input - tau_star, min=0) ** 2 + ctx.save_for_backward(output) + return output + + @staticmethod + def backward(ctx, grad_output): + """backward-pass. + + Args: + ctx: context, to increase the speed + grad_output: grad from the next layers + + Returns: + grad output + """ + (Y,) = ctx.saved_tensors + gppr = Y.sqrt() # = 1 / g'' (Y) + dX = grad_output * gppr + q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) + q = q.unsqueeze(ctx.dim) + dX -= q * gppr + return dX, None + + @staticmethod + def _threshold_and_support(input, dim=-1): + """Sparsemax building block compute the threshold. + + Args: + input: any dimension + dim: dimension along which to apply the sparsemax + + Returns: + the threshold value + """ + Xsrt, _ = torch.sort(input, descending=True, dim=dim) + + rho = _make_ix_like(input, dim) + mean = Xsrt.cumsum(dim) / rho + mean_sq = (Xsrt ** 2).cumsum(dim) / rho + ss = rho * (mean_sq - mean ** 2) + delta = (1 - ss) / rho + + # NOTE this is not exactly the same as in reference algo + # Fortunately it seems the clamped values never wrongly + # get selected by tau <= sorted_z. Prove this! + delta_nz = torch.clamp(delta, 0) + tau = mean - torch.sqrt(delta_nz) + + support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) + tau_star = tau.gather(dim, support_size - 1) + return tau_star, support_size + + +class Entmoid15(Function): + """A highly optimized equivalent of lambda x: Entmax15([x, 0]).""" + + @staticmethod + def forward(ctx, input): + """Entmoid15 (a la softmax). 
+ + Args: + ctx: context, to increase the speed + input (Tensor): any shape + + Returns: + output (Tensor): same shape as input + """ + output = Entmoid15._forward(input) + ctx.save_for_backward(output) + return output + + @staticmethod + @script + def _forward(input): + input, is_pos = abs(input), input >= 0 + tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 + tau.masked_fill_(tau <= input, 2.0) + y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 + return torch.where(is_pos, 1 - y_neg, y_neg) + + @staticmethod + def backward(ctx, grad_output): + """backward-pass. + + Args: + ctx: context, to increase the speed + grad_output: grad from the next layers + + Returns: + grad output + """ + return Entmoid15._backward(ctx.saved_tensors[0], grad_output) + + @staticmethod + @script + def _backward(output, grad_output): + gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() + grad_input = grad_output * gppr0 + q = grad_input / (gppr0 + gppr1) + grad_input -= q * gppr0 + return grad_input + + +entmax15 = lambda input, dim=-1: Entmax15Function.apply(input, dim) # noqa: E731 +entmoid15 = Entmoid15.apply # noqa: E731 + + +class Lambda(nn.Module): + """Pytorch implementation of lambda. + + Args: + func : returned func + """ + + def __init__(self, func): + super().__init__() + self.func = func + + def forward(self, *args, **kwargs): + """Forward-pass. 
+ + # noqa: DAR101 + + Returns: + f(*args, **kwargs) + """ + return self.func(*args, **kwargs) + + +class ModuleWithInit(nn.Module): + """Base class for pytorch module with data-aware initializer on first batch.""" + + def __init__(self): + super().__init__() + self._is_initialized_tensor = nn.Parameter(torch.tensor(0, dtype=torch.uint8), requires_grad=False) + self._is_initialized_bool = None + # Note: this module uses a separate flag self._is_initialized so as to achieve both + # * persistence: is_initialized is saved alongside model in state_dict + # * speed: model doesn't need to cache + # please DO NOT use these flags in child modules + + def initialize(self, *args, **kwargs): + """Initialize module tensors using first batch of data.""" + raise NotImplementedError("Please implement ") + + def __call__(self, *args, **kwargs): + """Initialize module after forward-pass. + + # noqa: DAR101 + + Returns: + Forward-pass. + """ + if self._is_initialized_bool is None: + self._is_initialized_bool = bool(self._is_initialized_tensor.item()) + if not self._is_initialized_bool: + self.initialize(*args, **kwargs) + self._is_initialized_tensor.data[...] = 1 + self._is_initialized_bool = True + return super().__call__(*args, **kwargs) + + +class ODST(ModuleWithInit): + r"""Oblivious Differentiable Sparsemax Trees. http://tinyurl.com/odst-readmore. + + One can drop (sic!) this module anywhere instead of nn.Linear + + Args: + in_features: number of features in the input tensor + num_trees: number of trees in this layer + tree_dim: number of response channels in the response of individual tree + depth: number of splits in every tree + flatten_output: if False, returns [..., num_trees, tree_dim], + by default returns [..., num_trees * tree_dim] + choice_function: f(tensor, dim) -> R_simplex computes feature weights s.t. 
f(tensor, dim).sum(dim) == 1 + bin_function: f(tensor) -> R[0, 1], computes tree leaf weights + initialize_response_: in-place initializer for tree output tensor + initialize_selection_logits_: in-place initializer for logits that select features for the tree + both thresholds and scales are initialized with data-aware init (or .load_state_dict) + threshold_init_beta: initializes threshold to a q-th quantile of data points + where q ~ Beta(:threshold_init_beta:, :threshold_init_beta:) + If this param is set to 1, initial thresholds will have the same distribution as data points + If greater than 1 (e.g. 10), thresholds will be closer to median data value + If less than 1 (e.g. 0.1), thresholds will approach min/max data values. + threshold_init_cutoff: threshold log-temperatures initializer, \in (0, inf) + By default (1.0), log-temperatures are initialized in such a way that all bin selectors + end up in the linear region of sparse-sigmoid. The temperatures are then scaled by this parameter. 
+ Setting this value > 1.0 will result in some margin between data points and sparse-sigmoid cutoff value + Setting this value < 1.0 will cause (1 - value) part of data points to end up in flat sparse-sigmoid region + For instance, threshold_init_cutoff = 0.9 will set 10% points equal to 0.0 or 1.0 + Setting this value > 1.0 will result in a margin between data points and sparse-sigmoid cutoff value + All points will be between (0.5 - 0.5 / threshold_init_cutoff) and (0.5 + 0.5 / threshold_init_cutoff) + """ + + def __init__( + self, + in_features, + num_trees, + depth=6, + tree_dim=1, + flatten_output=True, + choice_function=entmax15, + bin_function=entmoid15, + initialize_response_=nn.init.normal_, + initialize_selection_logits_=nn.init.uniform_, + threshold_init_beta=1.0, + threshold_init_cutoff=1.0, + ): + super().__init__() + self.depth, self.num_trees, self.tree_dim, self.flatten_output = depth, num_trees, tree_dim, flatten_output + self.choice_function, self.bin_function = choice_function, bin_function + self.threshold_init_beta, self.threshold_init_cutoff = threshold_init_beta, threshold_init_cutoff + + self.response = nn.Parameter(torch.zeros([num_trees, tree_dim, 2 ** depth]), requires_grad=True) + initialize_response_(self.response) + + self.feature_selection_logits = nn.Parameter(torch.zeros([in_features, num_trees, depth]), requires_grad=True) + initialize_selection_logits_(self.feature_selection_logits) + + self.feature_thresholds = nn.Parameter( + torch.full([num_trees, depth], float("nan"), dtype=torch.float32), requires_grad=True + ) # nan values will be initialized on first batch (data-aware init) + + self.log_temperatures = nn.Parameter( + torch.full([num_trees, depth], float("nan"), dtype=torch.float32), requires_grad=True + ) + + # binary codes for mapping between 1-hot vectors and bin indices + with torch.no_grad(): + indices = torch.arange(2 ** self.depth) + offsets = 2 ** torch.arange(self.depth) + bin_codes = (indices.view(1, -1) // 
offsets.view(-1, 1) % 2).to(torch.float32) + bin_codes_1hot = torch.stack([bin_codes, 1.0 - bin_codes], dim=-1) + self.bin_codes_1hot = nn.Parameter(bin_codes_1hot, requires_grad=False) + # ^-- [depth, 2 ** depth, 2] + + def forward(self, input): + """Forward-pass. + + Args: + input: any shape + + Returns: + response + """ + assert len(input.shape) >= 2 + if len(input.shape) > 2: + return self.forward(input.view(-1, input.shape[-1])).view(*input.shape[:-1], -1) + # new input shape: [batch_size, in_features] + + feature_logits = self.feature_selection_logits + feature_selectors = self.choice_function(feature_logits, dim=0) + # ^--[in_features, num_trees, depth] + + feature_values = torch.einsum("bi,ind->bnd", input, feature_selectors) + # ^--[batch_size, num_trees, depth] + + threshold_logits = (feature_values - self.feature_thresholds) * torch.exp(-self.log_temperatures) + + threshold_logits = torch.stack([-threshold_logits, threshold_logits], dim=-1) + # ^--[batch_size, num_trees, depth, 2] + + bins = self.bin_function(threshold_logits) + # ^--[batch_size, num_trees, depth, 2], approximately binary + + bin_matches = torch.einsum("btds,dcs->btdc", bins, self.bin_codes_1hot) + # ^--[batch_size, num_trees, depth, 2 ** depth] + + response_weights = torch.prod(bin_matches, dim=-2) + # ^-- [batch_size, num_trees, 2 ** depth] + + response = torch.einsum("bnd,ncd->bnc", response_weights, self.response) + # ^-- [batch_size, num_trees, tree_dim] + + return response.flatten(1, 2) if self.flatten_output else response + + def initialize(self, input, eps=1e-6): + """Initialization. 
+ + Args: + input: any dimension + eps: extra epsilon as a temperature + """ + # data-aware initializer + assert len(input.shape) == 2 + with torch.no_grad(): + feature_selectors = self.choice_function(self.feature_selection_logits, dim=0) + # ^--[in_features, num_trees, depth] + + feature_values = torch.einsum("bi,ind->bnd", input, feature_selectors) + # ^--[batch_size, num_trees, depth] + + # initialize thresholds: sample random percentiles of data + percentiles_q = 100 * np.random.beta( + self.threshold_init_beta, self.threshold_init_beta, size=[self.num_trees, self.depth] + ) + self.feature_thresholds.data[...] = torch.as_tensor( + list(map(np.percentile, check_numpy(feature_values.flatten(1, 2).t()), percentiles_q.flatten())), + dtype=feature_values.dtype, + device=feature_values.device, + ).view(self.num_trees, self.depth) + + # init temperatures: make sure enough data points are in the linear region of sparse-sigmoid + temperatures = np.percentile( + check_numpy(abs(feature_values - self.feature_thresholds)), + q=100 * min(1.0, self.threshold_init_cutoff), + axis=0, + ) + + # if threshold_init_cutoff > 1, scale everything down by it + temperatures /= max(1.0, self.threshold_init_cutoff) + self.log_temperatures.data[...] = torch.log(torch.as_tensor(temperatures) + eps) + + def __repr__(self): + return "{}(in_features={}, num_trees={}, depth={}, tree_dim={}, flatten_output={})".format( + self.__class__.__name__, + self.feature_selection_logits.shape[0], + self.num_trees, + self.depth, + self.tree_dim, + self.flatten_output, + ) + + +class DenseODSTBlock(nn.Sequential): + """The DenseBlock from https://github.com/Qwicen. + + Args: + input_dim: Input dim. + layer_dim: num trees in one layer. + num_layers: number of forests. + tree_dim: number of response channels in the response of individual tree. + max_features: maximum number of features per input + depth: number of splits in every tree. + input_dropout: Dropout rate for the input of each forest layer. 
+ flatten_output: flatten output or not. + """ + + def __init__( + self, + input_dim, + layer_dim, + num_layers, + tree_dim=1, + max_features=None, + input_dropout=0.0, + flatten_output=True, + **kwargs + ): + layers = [] + for i in range(num_layers): + oddt = ODST(input_dim, layer_dim, tree_dim=tree_dim, flatten_output=True, **kwargs) + input_dim = min(input_dim + layer_dim * tree_dim, max_features or float("inf")) + layers.append(oddt) + + super().__init__(*layers) + self.num_layers, self.layer_dim, self.tree_dim = num_layers, layer_dim, tree_dim + self.max_features, self.flatten_output = max_features, flatten_output + self.input_dropout = input_dropout + + def forward(self, x): + """Forward-pass.""" + initial_features = x.shape[-1] + for layer in self: + layer_inp = x + if self.max_features is not None: + tail_features = min(self.max_features, layer_inp.shape[-1]) - initial_features + if tail_features != 0: + layer_inp = torch.cat([layer_inp[..., :initial_features], layer_inp[..., -tail_features:]], dim=-1) + if self.input_dropout: + layer_inp = F.dropout(layer_inp, self.input_dropout, self.training) + h = layer(layer_inp) + x = torch.cat([x, h], dim=-1) + + outputs = x[..., initial_features:] + if not self.flatten_output: + outputs = outputs.view(*outputs.shape[:-1], self.num_layers * self.layer_dim, self.tree_dim) + return outputs From 6415c28f62178c713c3425b8820c049123a25709 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Thu, 3 Aug 2023 14:29:59 +0000 Subject: [PATCH 06/49] added tutorial run --- examples/tutorials/Tutorial_8_CV_preset.ipynb | 1440 +++++++++-------- 1 file changed, 781 insertions(+), 659 deletions(-) diff --git a/examples/tutorials/Tutorial_8_CV_preset.ipynb b/examples/tutorials/Tutorial_8_CV_preset.ipynb index 06a4f833..8a946476 100644 --- a/examples/tutorials/Tutorial_8_CV_preset.ipynb +++ b/examples/tutorials/Tutorial_8_CV_preset.ipynb @@ -62,23 +62,21 @@ "outputs": [], "source": [ "##Kaggle functionality for loading data; Note that 
you have to use your kaggle API token (see the link above):\n", - "#!pip install opendatasets\n", - "#!pip install -q kaggle\n", - "#!pip install --upgrade --force-reinstall --no-deps kaggle\n", - "#!mkdir ~/.kaggle\n", - "#!ls ~/.kaggle\n", - "#!cp kaggle.json ~/.kaggle/\n", - "#!chmod 600 ~/.kaggle/kaggle.json\n", - "#!kaggle competitions download -c paddy-disease-classification\n", + "# !pip install opendatasets\n", + "# !pip install -q kaggle\n", + "# !pip install --upgrade --force-reinstall --no-deps kaggle\n", + "# !mkdir ~/.kaggle\n", + "# !ls ~/.kaggle\n", + "# !cp kaggle.json ~/.kaggle/\n", + "# !chmod 600 ~/.kaggle/kaggle.json\n", + "# !kaggle competitions download -c paddy-disease-classification\n", "\n", - "##Unpack data:\n", - "#!mkdir paddy-disease\n", - "#!unzip paddy-disease-classification.zip -d paddy-disease\n", + "# #Unpack data:\n", + "# !mkdir paddy-disease\n", + "# !unzip paddy-disease-classification.zip -d paddy-disease\n", "\n", - "##Install LightAutoML, Pandas and torch EfficientNet:\n", - "#!pip install -U lightautoml[cv] #[cv] is for installing CV tasks functionality\n", - "#!pip install efficientnet-pytorch==0.7.0\n", - "#!pip install -U pandas" + "# #Install LightAutoML, Pandas and torch EfficientNet:\n", + "# !pip install -U lightautoml[cv] #[cv] is for installing CV tasks functionality\n" ] }, { @@ -102,6 +100,17 @@ "- LightAutoML modules: `TabularCVAutoML` preset for AutoML model creation and Task class to setup what kind of ML problem we solve (binary/multiclass classification or regression)" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -122,7 +131,32 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'nlp' extra 
dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n", + "'nlp' extra dependecy package 'transformers' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dvladimirvasilyev/LightAutoML/lightautoml/ml_algo/dl_model.py:41: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n", + "/home/dvladimirvasilyev/LightAutoML/lightautoml/text/nn_model.py:22: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n", + "/home/dvladimirvasilyev/LightAutoML/lightautoml/text/dl_transformers.py:25: UserWarning: 'transformers' - package isn't installed\n", + " warnings.warn(\"'transformers' - package isn't installed\")\n" + ] + } + ], "source": [ "# Standard python libraries\n", "import os\n", @@ -443,8 +477,8 @@ "77 42\n", "73 38\n", "66 36\n", - "82 5\n", "62 5\n", + "82 5\n", "Name: age, dtype: int64" ] }, @@ -576,8 +610,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 10.5 ms, sys: 1.28 ms, total: 11.8 ms\n", - "Wall time: 10.6 ms\n" + "CPU times: user 4.89 ms, sys: 485 µs, total: 5.37 ms\n", + "Wall time: 5.14 ms\n" ] }, { @@ -783,7 +817,7 @@ }, { "cell_type": "code", - "execution_count": 54, + 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -797,6 +831,20 @@ "scrolled": true }, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c0e2e2174b1644ed91ed76b5f30a6d6e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", @@ -817,10 +865,10 @@ ], "source": [ "from PIL import Image\n", - "\n", + "from tqdm.notebook import tqdm\n", "new_imgs = []\n", "\n", - "for i, p in enumerate(train_data['path'].values):\n", + "for i, p in tqdm(enumerate(train_data['path'].values)):\n", " if i % 1000 == 0: \n", " print(i)\n", " \n", @@ -1013,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -1025,6 +1073,20 @@ "execution_count": 14, "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2d6a6c9a493b4a6298da4543f3ed3dba", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", @@ -1039,7 +1101,7 @@ "source": [ "new_imgs = []\n", "\n", - "for i, p in enumerate(submission['path'].values):\n", + "for i, p in tqdm(enumerate(submission['path'].values)):\n", " if i % 1000 == 0: \n", " print(i)\n", " \n", @@ -1365,117 +1427,143 @@ "name": "stdout", "output_type": "stream", "text": [ - "[10:13:29] Stdout logging level is INFO3.\n", - "[10:13:29] Task: multiclass\n", + "[14:04:32] Stdout logging level is INFO3.\n", + "[14:04:32] Task: multiclass\n", "\n", - "[10:13:29] Start automl preset with listed constraints:\n", - "[10:13:29] - time: 18000.00 seconds\n", - "[10:13:29] - CPU: 2 cores\n", - "[10:13:29] - memory: 16 GB\n", + "[14:04:32] Start automl preset with listed constraints:\n", + "[14:04:32] - 
time: 18000.00 seconds\n", + "[14:04:32] - CPU: 2 cores\n", + "[14:04:32] - memory: 16 GB\n", "\n", - "[10:13:29] Train data shape: (114477, 5)\n", + "[14:04:32] \u001b[1mTrain data shape: (114477, 5)\u001b[0m\n", "\n", - "[10:13:29] Layer 1 train process start. Time left 17999.80 secs\n", - "Loaded pretrained weights for efficientnet-b0\n", - "[10:13:33] Load saved dataset for path\n", - "[10:13:34] Feature path transformed\n", - "[10:13:43] Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...\n", - "[10:13:43] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", - "[10:13:57] Linear model: C = 1e-05 score = -1.1418084988175383\n", - "[10:14:11] Linear model: C = 5e-05 score = -0.8305336454784469\n", - "[10:14:20] Linear model: C = 0.0001 score = -0.720568943030616\n", - "[10:14:33] Linear model: C = 0.0005 score = -0.5286940477535328\n", - "[10:14:46] Linear model: C = 0.001 score = -0.4742702118116027\n", - "[10:15:05] Linear model: C = 0.005 score = -0.4115479073137628\n", - "[10:15:22] Linear model: C = 0.01 score = -0.4116697343411257\n", - "[10:15:37] Linear model: C = 0.05 score = -0.4708596346733632\n", - "[10:15:37] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", - "[10:15:53] Linear model: C = 1e-05 score = -1.125846835364122\n", - "[10:16:06] Linear model: C = 5e-05 score = -0.8160193296553417\n", - "[10:16:15] Linear model: C = 0.0001 score = -0.7059358171644057\n", - "[10:16:30] Linear model: C = 0.0005 score = -0.519242546498812\n", - "[10:16:41] Linear model: C = 0.001 score = -0.46901655981859697\n", - "[10:16:57] Linear model: C = 0.005 score = -0.41414562408622063\n", - "[10:17:11] Linear model: C = 0.01 score = -0.41384713476625173\n", - "[10:17:27] Linear model: C = 0.05 score = -0.4648754680980122\n", - "[10:17:43] Linear model: C = 0.1 score = -0.5075886657099099\n", - "[10:17:44] ===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", - "[10:17:59] Linear model: C = 1e-05 score = 
-1.1051242355426971\n", - "[10:18:13] Linear model: C = 5e-05 score = -0.8001807308024304\n", - "[10:18:22] Linear model: C = 0.0001 score = -0.694614750733295\n", - "[10:18:37] Linear model: C = 0.0005 score = -0.5152255270514708\n", - "[10:18:49] Linear model: C = 0.001 score = -0.4661388869197108\n", - "[10:19:04] Linear model: C = 0.005 score = -0.41149006525348847\n", - "[10:19:19] Linear model: C = 0.01 score = -0.4104284110463969\n", - "[10:19:35] Linear model: C = 0.05 score = -0.45961952123715527\n", - "[10:19:51] Linear model: C = 0.1 score = -0.5011072775518325\n", - "[10:19:51] ===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", - "[10:20:09] Linear model: C = 1e-05 score = -1.1144246553258361\n", - "[10:20:25] Linear model: C = 5e-05 score = -0.8084470717185533\n", - "[10:20:35] Linear model: C = 0.0001 score = -0.7023525467007014\n", - "[10:20:49] Linear model: C = 0.0005 score = -0.5214177088196867\n", - "[10:21:01] Linear model: C = 0.001 score = -0.4720856273082093\n", - "[10:21:15] Linear model: C = 0.005 score = -0.4191401085852046\n", - "[10:21:32] Linear model: C = 0.01 score = -0.4208859924287323\n", - "[10:21:47] Linear model: C = 0.05 score = -0.4851827484977867\n", - "[10:21:48] ===== Start working with fold 4 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n", - "[10:22:04] Linear model: C = 1e-05 score = -1.0955685661573173\n", - "[10:22:17] Linear model: C = 5e-05 score = -0.7741691221828721\n", - "[10:22:26] Linear model: C = 0.0001 score = -0.6653917443236547\n", - "[10:22:42] Linear model: C = 0.0005 score = -0.48628652950179174\n", - "[10:22:53] Linear model: C = 0.001 score = -0.4381206233809863\n", - "[10:23:09] Linear model: C = 0.005 score = -0.38322203539072797\n", - "[10:23:24] Linear model: C = 0.01 score = -0.38175313554732276\n", - "[10:23:40] Linear model: C = 0.05 score = -0.42856031627342633\n", - "[10:23:55] Linear model: C = 0.1 score = -0.4689137362889697\n", - "[10:23:55] Fitting 
Lvl_0_Pipe_0_Mod_0_LinearL2 finished. score = -0.4073443684095255\n", - "[10:23:55] Lvl_0_Pipe_0_Mod_0_LinearL2 fitting and predicting completed\n", - "[10:23:55] Time left 17373.85 secs\n", + "[14:04:32] Layer \u001b[1m1\u001b[0m train process start. Time left 17999.83 secs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 895/895 [07:29<00:00, 1.99it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[14:12:09] Feature path transformed\n", + "[14:12:16] Start fitting \u001b[1mLvl_0_Pipe_0_Mod_0_LinearL2\u001b[0m ...\n", + "[14:12:17] ===== Start working with \u001b[1mfold 0\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_LinearL2\u001b[0m =====\n", + "[14:12:26] Linear model: C = 1e-05 score = -0.9995305866945853\n", + "[14:12:32] Linear model: C = 5e-05 score = -0.6879959560713191\n", + "[14:12:38] Linear model: C = 0.0001 score = -0.5802952177399445\n", + "[14:12:45] Linear model: C = 0.0005 score = -0.3907926611544111\n", + "[14:12:51] Linear model: C = 0.001 score = -0.33425017155675657\n", + "[14:13:00] Linear model: C = 0.005 score = -0.2559518217619532\n", + "[14:13:07] Linear model: C = 0.01 score = -0.24141776919439237\n", + "[14:13:15] Linear model: C = 0.05 score = -0.2431661172897411\n", + "[14:13:23] Linear model: C = 0.1 score = -0.25925367786528475\n", + "[14:13:24] ===== Start working with \u001b[1mfold 1\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_LinearL2\u001b[0m =====\n", + "[14:13:32] Linear model: C = 1e-05 score = -0.9872444001968863\n", + "[14:13:39] Linear model: C = 5e-05 score = -0.6682540100549987\n", + "[14:13:45] Linear model: C = 0.0001 score = -0.5574685730009872\n", + "[14:13:51] Linear model: C = 0.0005 score = -0.3653461360638747\n", + "[14:13:58] Linear model: C = 0.001 score = -0.31059360297670363\n", + "[14:14:05] Linear model: C = 0.005 score = -0.2370436682635623\n", + "[14:14:14] Linear model: C = 0.01 score = -0.22495884629469698\n", + "[14:14:21] Linear 
model: C = 0.05 score = -0.23420873784566962\n", + "[14:14:29] Linear model: C = 0.1 score = -0.25263966927426823\n", + "[14:14:29] ===== Start working with \u001b[1mfold 2\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_LinearL2\u001b[0m =====\n", + "[14:14:37] Linear model: C = 1e-05 score = -0.9554531133528031\n", + "[14:14:43] Linear model: C = 5e-05 score = -0.640784196156178\n", + "[14:14:49] Linear model: C = 0.0001 score = -0.5345024606190905\n", + "[14:14:57] Linear model: C = 0.0005 score = -0.3546726337461952\n", + "[14:15:04] Linear model: C = 0.001 score = -0.30344210801693483\n", + "[14:15:12] Linear model: C = 0.005 score = -0.2331574262775805\n", + "[14:15:19] Linear model: C = 0.01 score = -0.22071779776854528\n", + "[14:15:28] Linear model: C = 0.05 score = -0.22603075278344578\n", + "[14:15:36] Linear model: C = 0.1 score = -0.24138537694410292\n", + "[14:15:36] ===== Start working with \u001b[1mfold 3\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_LinearL2\u001b[0m =====\n", + "[14:15:44] Linear model: C = 1e-05 score = -0.973115505822288\n", + "[14:15:51] Linear model: C = 5e-05 score = -0.6613476137718094\n", + "[14:15:56] Linear model: C = 0.0001 score = -0.5539538946164072\n", + "[14:16:04] Linear model: C = 0.0005 score = -0.3666276035478478\n", + "[14:16:10] Linear model: C = 0.001 score = -0.31130200709742806\n", + "[14:16:18] Linear model: C = 0.005 score = -0.2326339584928626\n", + "[14:16:25] Linear model: C = 0.01 score = -0.21658099282365262\n", + "[14:16:33] Linear model: C = 0.05 score = -0.21364841773406087\n", + "[14:16:42] Linear model: C = 0.1 score = -0.2256018292053085\n", + "[14:16:51] Linear model: C = 0.5 score = -0.2763179966937595\n", + "[14:16:51] ===== Start working with \u001b[1mfold 4\u001b[0m for \u001b[1mLvl_0_Pipe_0_Mod_0_LinearL2\u001b[0m =====\n", + "[14:16:58] Linear model: C = 1e-05 score = -0.9531496536787142\n", + "[14:17:05] Linear model: C = 5e-05 score = -0.6270339670737181\n", + "[14:17:10] Linear model: C = 0.0001 
score = -0.517302736118502\n", + "[14:17:17] Linear model: C = 0.0005 score = -0.331531311465719\n", + "[14:17:23] Linear model: C = 0.001 score = -0.27798570249468424\n", + "[14:17:32] Linear model: C = 0.005 score = -0.20448637290477473\n", + "[14:17:39] Linear model: C = 0.01 score = -0.19081673660070902\n", + "[14:17:47] Linear model: C = 0.05 score = -0.1923892363102242\n", + "[14:17:56] Linear model: C = 0.1 score = -0.20661581389305533\n", + "[14:17:56] Fitting \u001b[1mLvl_0_Pipe_0_Mod_0_LinearL2\u001b[0m finished. score = \u001b[1m-0.21831477243925082\u001b[0m\n", + "[14:17:56] \u001b[1mLvl_0_Pipe_0_Mod_0_LinearL2\u001b[0m fitting and predicting completed\n", + "[14:17:56] Time left 17195.98 secs\n", "\n", - "[10:29:05] Start fitting Lvl_0_Pipe_1_Mod_0_CatBoost ...\n", - "[10:29:05] ===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_CatBoost =====\n", - "[10:29:06] 0:\tlearn: 2.2635128\ttest: 2.2654787\tbest: 2.2654787 (0)\ttotal: 8.13ms\tremaining: 32.5s\n", - "[10:29:28] bestTest = 0.2448323674\n", - "[10:29:28] bestIteration = 3999\n", - "[10:29:29] ===== Start working with fold 1 for Lvl_0_Pipe_1_Mod_0_CatBoost =====\n", - "[10:29:30] 0:\tlearn: 2.2645175\ttest: 2.2638240\tbest: 2.2638240 (0)\ttotal: 7.8ms\tremaining: 31.2s\n", - "[10:29:52] bestTest = 0.2655982428\n", - "[10:29:52] bestIteration = 3998\n", - "[10:29:52] Shrink model to first 3999 iterations.\n", - "[10:29:53] ===== Start working with fold 2 for Lvl_0_Pipe_1_Mod_0_CatBoost =====\n", - "[10:29:53] 0:\tlearn: 2.2638053\ttest: 2.2658397\tbest: 2.2658397 (0)\ttotal: 7.9ms\tremaining: 31.6s\n", - "[10:30:15] bestTest = 0.2736756787\n", - "[10:30:15] bestIteration = 3999\n", - "[10:30:16] ===== Start working with fold 3 for Lvl_0_Pipe_1_Mod_0_CatBoost =====\n", - "[10:30:17] 0:\tlearn: 2.2646526\ttest: 2.2635595\tbest: 2.2635595 (0)\ttotal: 7.37ms\tremaining: 29.5s\n", - "[10:30:38] bestTest = 0.2742944825\n", - "[10:30:38] bestIteration = 3998\n", - "[10:30:38] Shrink model to first 
3999 iterations.\n", - "[10:30:39] ===== Start working with fold 4 for Lvl_0_Pipe_1_Mod_0_CatBoost =====\n", - "[10:30:40] 0:\tlearn: 2.2639121\ttest: 2.2648622\tbest: 2.2648622 (0)\ttotal: 7.03ms\tremaining: 28.1s\n", - "[10:31:01] bestTest = 0.2581136896\n", - "[10:31:01] bestIteration = 3998\n", - "[10:31:01] Shrink model to first 3999 iterations.\n", - "[10:31:02] Fitting Lvl_0_Pipe_1_Mod_0_CatBoost finished. score = -0.26330128259018876\n", - "[10:31:02] Lvl_0_Pipe_1_Mod_0_CatBoost fitting and predicting completed\n", - "[10:31:02] Time left 16947.02 secs\n", + "[14:22:15] Start fitting \u001b[1mLvl_0_Pipe_1_Mod_0_CatBoost\u001b[0m ...\n", + "[14:22:16] ===== Start working with \u001b[1mfold 0\u001b[0m for \u001b[1mLvl_0_Pipe_1_Mod_0_CatBoost\u001b[0m =====\n", + "[14:22:16] 0:\tlearn: 2.2636799\ttest: 2.2649649\tbest: 2.2649649 (0)\ttotal: 6.85ms\tremaining: 27.4s\n", + "[14:22:35] bestTest = 0.2436411292\n", + "[14:22:35] bestIteration = 3999\n", + "[14:22:35] ===== Start working with \u001b[1mfold 1\u001b[0m for \u001b[1mLvl_0_Pipe_1_Mod_0_CatBoost\u001b[0m =====\n", + "[14:22:36] 0:\tlearn: 2.2634692\ttest: 2.2632526\tbest: 2.2632526 (0)\ttotal: 6.16ms\tremaining: 24.6s\n", + "[14:22:55] bestTest = 0.2658199543\n", + "[14:22:55] bestIteration = 3999\n", + "[14:22:56] ===== Start working with \u001b[1mfold 2\u001b[0m for \u001b[1mLvl_0_Pipe_1_Mod_0_CatBoost\u001b[0m =====\n", + "[14:22:56] 0:\tlearn: 2.2631654\ttest: 2.2656298\tbest: 2.2656298 (0)\ttotal: 6.08ms\tremaining: 24.3s\n", + "[14:23:16] bestTest = 0.2753673319\n", + "[14:23:16] bestIteration = 3999\n", + "[14:23:16] ===== Start working with \u001b[1mfold 3\u001b[0m for \u001b[1mLvl_0_Pipe_1_Mod_0_CatBoost\u001b[0m =====\n", + "[14:23:17] 0:\tlearn: 2.2645696\ttest: 2.2657045\tbest: 2.2657045 (0)\ttotal: 6.76ms\tremaining: 27s\n", + "[14:23:37] bestTest = 0.2738943611\n", + "[14:23:37] bestIteration = 3996\n", + "[14:23:37] Shrink model to first 3997 iterations.\n", + "[14:23:37] ===== Start 
working with \u001b[1mfold 4\u001b[0m for \u001b[1mLvl_0_Pipe_1_Mod_0_CatBoost\u001b[0m =====\n", + "[14:23:38] 0:\tlearn: 2.2642805\ttest: 2.2644245\tbest: 2.2644245 (0)\ttotal: 5.84ms\tremaining: 23.4s\n", + "[14:23:57] bestTest = 0.2538460334\n", + "[14:23:57] bestIteration = 3999\n", + "[14:23:58] Fitting \u001b[1mLvl_0_Pipe_1_Mod_0_CatBoost\u001b[0m finished. score = \u001b[1m-0.2625123265864018\u001b[0m\n", + "[14:23:58] \u001b[1mLvl_0_Pipe_1_Mod_0_CatBoost\u001b[0m fitting and predicting completed\n", + "[14:23:58] Time left 16834.07 secs\n", "\n", - "[10:31:02] Layer 1 training completed.\n", + "[14:23:58] \u001b[1mLayer 1 training completed.\u001b[0m\n", "\n", - "[10:31:02] Blending: optimization starts with equal weights and score -0.2506653444869967\n", - "[10:31:03] Blending: iteration 0: score = -0.23574740438551683, weights = [0.21525846 0.7847415 ]\n", - "[10:31:03] Blending: iteration 1: score = -0.23574740438551683, weights = [0.21525846 0.7847415 ]\n", - "[10:31:03] Blending: no score update. Terminated\n", + "[14:23:58] Blending: optimization starts with equal weights and score \u001b[1m-0.1879588701291192\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dvladimirvasilyev/anaconda3/envs/myenv/lib/python3.8/site-packages/sklearn/metrics/_classification.py:2916: UserWarning: The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[14:23:59] Blending: iteration \u001b[1m0\u001b[0m: score = \u001b[1m-0.18573794844833624\u001b[0m, weights = \u001b[1m[0.63928086 0.36071914]\u001b[0m\n", + "[14:23:59] Blending: iteration \u001b[1m1\u001b[0m: score = \u001b[1m-0.18573794844833624\u001b[0m, weights = \u001b[1m[0.63928086 0.36071914]\u001b[0m\n", + "[14:23:59] Blending: no score update. 
Terminated\n", "\n", - "[10:31:03] Automl preset training completed in 1054.20 seconds\n", + "[14:23:59] \u001b[1mAutoml preset training completed in 1167.35 seconds\u001b[0m\n", "\n", - "[10:31:03] Model description:\n", + "[14:23:59] Model description:\n", "Final prediction for new objects (level 0) = \n", - "\t 0.21526 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +\n", - "\t 0.78474 * (5 averaged models Lvl_0_Pipe_1_Mod_0_CatBoost) \n", + "\t 0.63928 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +\n", + "\t 0.36072 * (5 averaged models Lvl_0_Pipe_1_Mod_0_CatBoost) \n", "\n", - "CPU times: user 25min 28s, sys: 1min 41s, total: 27min 9s\n", - "Wall time: 17min 34s\n" + "CPU times: user 18min 40s, sys: 3min 1s, total: 21min 42s\n", + "Wall time: 19min 27s\n" ] } ], @@ -1494,7 +1582,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1600,7 +1688,7 @@ "[114477 rows x 2 columns]" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1615,6 +1703,24 @@ "execution_count": 22, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_12895/1432655611.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " preds['pred_' + str(i)] = oof_pred.data[:,i]\n", + "/tmp/ipykernel_12895/1432655611.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " preds['pred_' 
+ str(i)] = oof_pred.data[:,i]\n" + ] + }, { "data": { "text/html": [ @@ -1655,76 +1761,76 @@ " 0\n", " 100330.jpg\n", " bacterial_leaf_blight\n", - " 0.030123\n", - " 0.307897\n", - " 0.456500\n", - " 0.001812\n", - " 0.031111\n", - " 0.022188\n", - " 0.002502\n", - " 0.147808\n", - " 0.000054\n", - " 4.927851e-06\n", + " 0.023245\n", + " 0.315283\n", + " 0.470886\n", + " 0.002528\n", + " 0.021895\n", + " 0.007454\n", + " 0.001554\n", + " 0.157142\n", + " 8.914904e-06\n", + " 4.559626e-06\n", " \n", " \n", " 1\n", " 100365.jpg\n", " bacterial_leaf_blight\n", - " 0.002971\n", - " 0.121589\n", - " 0.027907\n", - " 0.000097\n", - " 0.014529\n", - " 0.002228\n", - " 0.000053\n", - " 0.830617\n", - " 0.000003\n", - " 5.009400e-06\n", + " 0.003717\n", + " 0.011035\n", + " 0.028317\n", + " 0.000110\n", + " 0.003178\n", + " 0.000015\n", + " 0.000131\n", + " 0.953496\n", + " 1.555987e-07\n", + " 5.692390e-07\n", " \n", " \n", " 2\n", " 100382.jpg\n", " bacterial_leaf_blight\n", - " 0.084655\n", - " 0.150933\n", - " 0.496689\n", - " 0.003890\n", - " 0.020583\n", - " 0.003894\n", - " 0.000731\n", - " 0.238610\n", - " 0.000010\n", - " 4.939010e-06\n", + " 0.025734\n", + " 0.095088\n", + " 0.208473\n", + " 0.000879\n", + " 0.007030\n", + " 0.003382\n", + " 0.000142\n", + " 0.659271\n", + " 3.872871e-07\n", + " 2.898941e-07\n", " \n", " \n", " 3\n", " 100632.jpg\n", " bacterial_leaf_blight\n", - " 0.008494\n", - " 0.691138\n", - " 0.045640\n", - " 0.000334\n", - " 0.090399\n", - " 0.000388\n", - " 0.000152\n", - " 0.163438\n", - " 0.000012\n", - " 5.071352e-06\n", + " 0.002876\n", + " 0.542942\n", + " 0.027466\n", + " 0.000317\n", + " 0.036005\n", + " 0.000398\n", + " 0.000082\n", + " 0.389901\n", + " 3.837710e-06\n", + " 9.339438e-06\n", " \n", " \n", " 4\n", " 101918.jpg\n", " bacterial_leaf_blight\n", - " 0.025846\n", - " 0.215757\n", - " 0.033691\n", - " 0.000158\n", - " 0.034468\n", - " 0.000259\n", - " 0.000074\n", - " 0.689742\n", - " 0.000001\n", - " 1.504601e-06\n", + 
" 0.009988\n", + " 0.033572\n", + " 0.017635\n", + " 0.000032\n", + " 0.008310\n", + " 0.000136\n", + " 0.000041\n", + " 0.930286\n", + " 1.554736e-07\n", + " 1.530466e-07\n", " \n", " \n", " ...\n", @@ -1745,76 +1851,76 @@ " 114472\n", " 110381.jpg\n", " tungro\n", - " 0.011443\n", - " 0.210518\n", - " 0.011583\n", - " 0.005677\n", - " 0.737980\n", - " 0.000237\n", - " 0.020141\n", - " 0.002203\n", - " 0.000218\n", - " 3.771045e-07\n", + " 0.001716\n", + " 0.109143\n", + " 0.020722\n", + " 0.001495\n", + " 0.845324\n", + " 0.000177\n", + " 0.021384\n", + " 0.000027\n", + " 6.304998e-06\n", + " 6.075803e-06\n", " \n", " \n", " 114473\n", " 110381.jpg\n", " tungro\n", - " 0.010602\n", - " 0.037910\n", - " 0.013473\n", - " 0.005240\n", - " 0.911409\n", - " 0.000039\n", - " 0.021000\n", - " 0.000297\n", - " 0.000029\n", - " 7.560920e-07\n", + " 0.022644\n", + " 0.137650\n", + " 0.026389\n", + " 0.004165\n", + " 0.788036\n", + " 0.001093\n", + " 0.019688\n", + " 0.000259\n", + " 3.142513e-05\n", + " 4.477663e-05\n", " \n", " \n", " 114474\n", " 110381.jpg\n", " tungro\n", - " 0.039948\n", - " 0.047678\n", - " 0.024860\n", - " 0.010115\n", - " 0.843555\n", - " 0.000212\n", - " 0.032901\n", - " 0.000302\n", - " 0.000428\n", - " 5.207394e-07\n", + " 0.016897\n", + " 0.072329\n", + " 0.010469\n", + " 0.005554\n", + " 0.789777\n", + " 0.001240\n", + " 0.103631\n", + " 0.000060\n", + " 1.301366e-05\n", + " 2.972130e-05\n", " \n", " \n", " 114475\n", " 110381.jpg\n", " tungro\n", - " 0.011752\n", - " 0.048414\n", - " 0.004618\n", - " 0.003911\n", - " 0.907945\n", - " 0.000063\n", - " 0.022485\n", - " 0.000527\n", - " 0.000281\n", - " 2.993103e-06\n", + " 0.008637\n", + " 0.114299\n", + " 0.082281\n", + " 0.003465\n", + " 0.560001\n", + " 0.000741\n", + " 0.230260\n", + " 0.000112\n", + " 1.909918e-04\n", + " 1.351225e-05\n", " \n", " \n", " 114476\n", " 110381.jpg\n", " tungro\n", - " 0.011532\n", - " 0.135282\n", - " 0.031151\n", - " 0.007868\n", - " 0.753411\n", - " 
0.000106\n", - " 0.059282\n", - " 0.001330\n", - " 0.000034\n", - " 4.079704e-06\n", + " 0.004179\n", + " 0.099988\n", + " 0.008320\n", + " 0.004660\n", + " 0.822037\n", + " 0.000663\n", + " 0.059627\n", + " 0.000318\n", + " 1.922170e-04\n", + " 1.441010e-05\n", " \n", " \n", "\n", @@ -1823,43 +1929,43 @@ ], "text/plain": [ " image_id label pred_0 pred_1 pred_2 \\\n", - "0 100330.jpg bacterial_leaf_blight 0.030123 0.307897 0.456500 \n", - "1 100365.jpg bacterial_leaf_blight 0.002971 0.121589 0.027907 \n", - "2 100382.jpg bacterial_leaf_blight 0.084655 0.150933 0.496689 \n", - "3 100632.jpg bacterial_leaf_blight 0.008494 0.691138 0.045640 \n", - "4 101918.jpg bacterial_leaf_blight 0.025846 0.215757 0.033691 \n", + "0 100330.jpg bacterial_leaf_blight 0.023245 0.315283 0.470886 \n", + "1 100365.jpg bacterial_leaf_blight 0.003717 0.011035 0.028317 \n", + "2 100382.jpg bacterial_leaf_blight 0.025734 0.095088 0.208473 \n", + "3 100632.jpg bacterial_leaf_blight 0.002876 0.542942 0.027466 \n", + "4 101918.jpg bacterial_leaf_blight 0.009988 0.033572 0.017635 \n", "... ... ... ... ... ... 
\n", - "114472 110381.jpg tungro 0.011443 0.210518 0.011583 \n", - "114473 110381.jpg tungro 0.010602 0.037910 0.013473 \n", - "114474 110381.jpg tungro 0.039948 0.047678 0.024860 \n", - "114475 110381.jpg tungro 0.011752 0.048414 0.004618 \n", - "114476 110381.jpg tungro 0.011532 0.135282 0.031151 \n", + "114472 110381.jpg tungro 0.001716 0.109143 0.020722 \n", + "114473 110381.jpg tungro 0.022644 0.137650 0.026389 \n", + "114474 110381.jpg tungro 0.016897 0.072329 0.010469 \n", + "114475 110381.jpg tungro 0.008637 0.114299 0.082281 \n", + "114476 110381.jpg tungro 0.004179 0.099988 0.008320 \n", "\n", - " pred_3 pred_4 pred_5 pred_6 pred_7 pred_8 \\\n", - "0 0.001812 0.031111 0.022188 0.002502 0.147808 0.000054 \n", - "1 0.000097 0.014529 0.002228 0.000053 0.830617 0.000003 \n", - "2 0.003890 0.020583 0.003894 0.000731 0.238610 0.000010 \n", - "3 0.000334 0.090399 0.000388 0.000152 0.163438 0.000012 \n", - "4 0.000158 0.034468 0.000259 0.000074 0.689742 0.000001 \n", - "... ... ... ... ... ... ... \n", - "114472 0.005677 0.737980 0.000237 0.020141 0.002203 0.000218 \n", - "114473 0.005240 0.911409 0.000039 0.021000 0.000297 0.000029 \n", - "114474 0.010115 0.843555 0.000212 0.032901 0.000302 0.000428 \n", - "114475 0.003911 0.907945 0.000063 0.022485 0.000527 0.000281 \n", - "114476 0.007868 0.753411 0.000106 0.059282 0.001330 0.000034 \n", + " pred_3 pred_4 pred_5 pred_6 pred_7 pred_8 \\\n", + "0 0.002528 0.021895 0.007454 0.001554 0.157142 8.914904e-06 \n", + "1 0.000110 0.003178 0.000015 0.000131 0.953496 1.555987e-07 \n", + "2 0.000879 0.007030 0.003382 0.000142 0.659271 3.872871e-07 \n", + "3 0.000317 0.036005 0.000398 0.000082 0.389901 3.837710e-06 \n", + "4 0.000032 0.008310 0.000136 0.000041 0.930286 1.554736e-07 \n", + "... ... ... ... ... ... ... 
\n", + "114472 0.001495 0.845324 0.000177 0.021384 0.000027 6.304998e-06 \n", + "114473 0.004165 0.788036 0.001093 0.019688 0.000259 3.142513e-05 \n", + "114474 0.005554 0.789777 0.001240 0.103631 0.000060 1.301366e-05 \n", + "114475 0.003465 0.560001 0.000741 0.230260 0.000112 1.909918e-04 \n", + "114476 0.004660 0.822037 0.000663 0.059627 0.000318 1.922170e-04 \n", "\n", " pred_9 \n", - "0 4.927851e-06 \n", - "1 5.009400e-06 \n", - "2 4.939010e-06 \n", - "3 5.071352e-06 \n", - "4 1.504601e-06 \n", + "0 4.559626e-06 \n", + "1 5.692390e-07 \n", + "2 2.898941e-07 \n", + "3 9.339438e-06 \n", + "4 1.530466e-07 \n", "... ... \n", - "114472 3.771045e-07 \n", - "114473 7.560920e-07 \n", - "114474 5.207394e-07 \n", - "114475 2.993103e-06 \n", - "114476 4.079704e-06 \n", + "114472 6.075803e-06 \n", + "114473 4.477663e-05 \n", + "114474 2.972130e-05 \n", + "114475 1.351225e-05 \n", + "114476 1.441010e-05 \n", "\n", "[114477 rows x 12 columns]" ] @@ -1928,76 +2034,76 @@ " 0\n", " 100001.jpg\n", " brown_spot\n", - " 0.003971\n", - " 0.001296\n", - " 0.003271\n", - " 1.106504e-02\n", - " 0.007749\n", - " 0.966667\n", - " 0.005006\n", - " 0.000583\n", - " 0.000050\n", - " 3.405732e-04\n", + " 0.001334\n", + " 0.000791\n", + " 0.002372\n", + " 5.432664e-03\n", + " 0.005328\n", + " 0.978495\n", + " 0.002519\n", + " 0.003511\n", + " 7.897679e-05\n", + " 1.378119e-04\n", " \n", " \n", " 1\n", " 100002.jpg\n", " normal\n", - " 0.898224\n", - " 0.038288\n", - " 0.016787\n", - " 2.553555e-02\n", - " 0.011513\n", - " 0.003794\n", - " 0.001537\n", - " 0.003510\n", - " 0.000794\n", - " 1.736730e-05\n", + " 0.978428\n", + " 0.011744\n", + " 0.001621\n", + " 3.187062e-03\n", + " 0.002579\n", + " 0.000282\n", + " 0.000156\n", + " 0.001969\n", + " 3.391063e-05\n", + " 1.971700e-07\n", " \n", " \n", " 2\n", " 100003.jpg\n", " hispa\n", - " 0.024842\n", - " 0.001781\n", - " 0.971629\n", - " 9.731490e-08\n", - " 0.000082\n", - " 0.000143\n", - " 0.000401\n", - " 0.001095\n", - " 0.000008\n", - 
" 1.762646e-05\n", + " 0.004639\n", + " 0.002192\n", + " 0.992883\n", + " 1.573081e-07\n", + " 0.000026\n", + " 0.000037\n", + " 0.000005\n", + " 0.000218\n", + " 1.920397e-07\n", + " 1.528186e-07\n", " \n", " \n", " 3\n", " 100004.jpg\n", " blast\n", - " 0.000396\n", - " 0.976271\n", - " 0.003184\n", - " 1.529731e-02\n", - " 0.002230\n", - " 0.002466\n", - " 0.000006\n", - " 0.000060\n", - " 0.000089\n", - " 5.999530e-07\n", + " 0.000259\n", + " 0.982406\n", + " 0.004401\n", + " 7.787708e-03\n", + " 0.002372\n", + " 0.002163\n", + " 0.000173\n", + " 0.000115\n", + " 3.223106e-04\n", + " 4.848040e-07\n", " \n", " \n", " 4\n", " 100005.jpg\n", " hispa\n", - " 0.040845\n", - " 0.017258\n", - " 0.914247\n", - " 6.972055e-05\n", - " 0.011210\n", - " 0.012487\n", - " 0.001175\n", - " 0.000358\n", - " 0.002349\n", - " 2.259109e-06\n", + " 0.010951\n", + " 0.047475\n", + " 0.829855\n", + " 1.200308e-05\n", + " 0.091933\n", + " 0.000418\n", + " 0.018967\n", + " 0.000370\n", + " 1.118553e-05\n", + " 8.759866e-06\n", " \n", " \n", " ...\n", @@ -2018,76 +2124,76 @@ " 10402\n", " 110403.jpg\n", " tungro\n", - " 0.004226\n", - " 0.011191\n", - " 0.024608\n", - " 7.129314e-03\n", - " 0.939744\n", - " 0.002247\n", - " 0.006578\n", - " 0.004179\n", - " 0.000066\n", - " 3.340825e-05\n", + " 0.001664\n", + " 0.002167\n", + " 0.007366\n", + " 4.507852e-03\n", + " 0.981122\n", + " 0.000052\n", + " 0.001666\n", + " 0.001455\n", + " 1.527430e-07\n", + " 3.928369e-07\n", " \n", " \n", " 10403\n", " 110404.jpg\n", " normal\n", - " 0.894465\n", - " 0.001002\n", - " 0.083502\n", - " 1.798824e-05\n", - " 0.007053\n", - " 0.000746\n", - " 0.012884\n", - " 0.000325\n", - " 0.000003\n", - " 4.647352e-07\n", + " 0.932484\n", + " 0.002359\n", + " 0.049850\n", + " 1.244102e-05\n", + " 0.011696\n", + " 0.000593\n", + " 0.002646\n", + " 0.000304\n", + " 4.828784e-05\n", + " 7.773816e-06\n", " \n", " \n", " 10404\n", " 110405.jpg\n", " dead_heart\n", - " 0.000375\n", - " 0.000232\n", - " 0.000546\n", 
- " 9.984713e-01\n", - " 0.000006\n", - " 0.000077\n", - " 0.000213\n", - " 0.000016\n", - " 0.000009\n", - " 5.422648e-05\n", + " 0.000192\n", + " 0.000044\n", + " 0.000152\n", + " 9.994839e-01\n", + " 0.000001\n", + " 0.000025\n", + " 0.000058\n", + " 0.000003\n", + " 1.957294e-06\n", + " 3.789358e-05\n", " \n", " \n", " 10405\n", " 110406.jpg\n", " blast\n", - " 0.000328\n", - " 0.957169\n", - " 0.000155\n", - " 2.832647e-02\n", - " 0.002730\n", - " 0.002962\n", - " 0.000273\n", - " 0.003520\n", - " 0.000160\n", - " 4.376236e-03\n", + " 0.000226\n", + " 0.977683\n", + " 0.000268\n", + " 9.254745e-03\n", + " 0.004962\n", + " 0.000595\n", + " 0.004523\n", + " 0.001717\n", + " 5.624577e-04\n", + " 2.080105e-04\n", " \n", " \n", " 10406\n", " 110407.jpg\n", " brown_spot\n", - " 0.000017\n", - " 0.000416\n", - " 0.001281\n", - " 6.053330e-04\n", - " 0.000525\n", - " 0.996624\n", - " 0.000190\n", - " 0.000081\n", - " 0.000190\n", - " 7.008029e-05\n", + " 0.000009\n", + " 0.000188\n", + " 0.000539\n", + " 4.357956e-04\n", + " 0.000232\n", + " 0.997215\n", + " 0.000039\n", + " 0.000010\n", + " 1.319862e-03\n", + " 1.372061e-05\n", " \n", " \n", "\n", @@ -2096,30 +2202,30 @@ ], "text/plain": [ " image_id label pred_0 pred_1 pred_2 pred_3 \\\n", - "0 100001.jpg brown_spot 0.003971 0.001296 0.003271 1.106504e-02 \n", - "1 100002.jpg normal 0.898224 0.038288 0.016787 2.553555e-02 \n", - "2 100003.jpg hispa 0.024842 0.001781 0.971629 9.731490e-08 \n", - "3 100004.jpg blast 0.000396 0.976271 0.003184 1.529731e-02 \n", - "4 100005.jpg hispa 0.040845 0.017258 0.914247 6.972055e-05 \n", + "0 100001.jpg brown_spot 0.001334 0.000791 0.002372 5.432664e-03 \n", + "1 100002.jpg normal 0.978428 0.011744 0.001621 3.187062e-03 \n", + "2 100003.jpg hispa 0.004639 0.002192 0.992883 1.573081e-07 \n", + "3 100004.jpg blast 0.000259 0.982406 0.004401 7.787708e-03 \n", + "4 100005.jpg hispa 0.010951 0.047475 0.829855 1.200308e-05 \n", "... ... ... ... ... ... ... 
\n", - "10402 110403.jpg tungro 0.004226 0.011191 0.024608 7.129314e-03 \n", - "10403 110404.jpg normal 0.894465 0.001002 0.083502 1.798824e-05 \n", - "10404 110405.jpg dead_heart 0.000375 0.000232 0.000546 9.984713e-01 \n", - "10405 110406.jpg blast 0.000328 0.957169 0.000155 2.832647e-02 \n", - "10406 110407.jpg brown_spot 0.000017 0.000416 0.001281 6.053330e-04 \n", + "10402 110403.jpg tungro 0.001664 0.002167 0.007366 4.507852e-03 \n", + "10403 110404.jpg normal 0.932484 0.002359 0.049850 1.244102e-05 \n", + "10404 110405.jpg dead_heart 0.000192 0.000044 0.000152 9.994839e-01 \n", + "10405 110406.jpg blast 0.000226 0.977683 0.000268 9.254745e-03 \n", + "10406 110407.jpg brown_spot 0.000009 0.000188 0.000539 4.357956e-04 \n", "\n", - " pred_4 pred_5 pred_6 pred_7 pred_8 pred_9 \n", - "0 0.007749 0.966667 0.005006 0.000583 0.000050 3.405732e-04 \n", - "1 0.011513 0.003794 0.001537 0.003510 0.000794 1.736730e-05 \n", - "2 0.000082 0.000143 0.000401 0.001095 0.000008 1.762646e-05 \n", - "3 0.002230 0.002466 0.000006 0.000060 0.000089 5.999530e-07 \n", - "4 0.011210 0.012487 0.001175 0.000358 0.002349 2.259109e-06 \n", - "... ... ... ... ... ... ... \n", - "10402 0.939744 0.002247 0.006578 0.004179 0.000066 3.340825e-05 \n", - "10403 0.007053 0.000746 0.012884 0.000325 0.000003 4.647352e-07 \n", - "10404 0.000006 0.000077 0.000213 0.000016 0.000009 5.422648e-05 \n", - "10405 0.002730 0.002962 0.000273 0.003520 0.000160 4.376236e-03 \n", - "10406 0.000525 0.996624 0.000190 0.000081 0.000190 7.008029e-05 \n", + " pred_4 pred_5 pred_6 pred_7 pred_8 pred_9 \n", + "0 0.005328 0.978495 0.002519 0.003511 7.897679e-05 1.378119e-04 \n", + "1 0.002579 0.000282 0.000156 0.001969 3.391063e-05 1.971700e-07 \n", + "2 0.000026 0.000037 0.000005 0.000218 1.920397e-07 1.528186e-07 \n", + "3 0.002372 0.002163 0.000173 0.000115 3.223106e-04 4.848040e-07 \n", + "4 0.091933 0.000418 0.018967 0.000370 1.118553e-05 8.759866e-06 \n", + "... ... ... ... ... ... ... 
\n", + "10402 0.981122 0.000052 0.001666 0.001455 1.527430e-07 3.928369e-07 \n", + "10403 0.011696 0.000593 0.002646 0.000304 4.828784e-05 7.773816e-06 \n", + "10404 0.000001 0.000025 0.000058 0.000003 1.957294e-06 3.789358e-05 \n", + "10405 0.004962 0.000595 0.004523 0.001717 5.624577e-04 2.080105e-04 \n", + "10406 0.000232 0.997215 0.000039 0.000010 1.319862e-03 1.372061e-05 \n", "\n", "[10407 rows x 12 columns]" ] @@ -2178,7 +2284,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Out-of-fold accuracy: 0.9436917459402325\n" + "Out-of-fold accuracy: 0.9686749303353512\n" ] } ], @@ -2201,14 +2307,12 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], @@ -2260,29 +2364,35 @@ } }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 163/163 [01:28<00:00, 1.84it/s]\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "[10:33:05] Load saved dataset for path\n", - "[10:33:05] Feature path transformed\n", + "[14:28:22] Feature path transformed\n", "Prediction for te_data:\n", - "array([[3.73373814e-02, 7.55886547e-03, 3.31766725e-01, ...,\n", - " 2.03554723e-02, 5.54920807e-05, 2.34885629e-05],\n", - " [9.47430134e-01, 1.18461961e-03, 3.56146842e-02, ...,\n", - " 2.40112818e-03, 1.31623001e-05, 1.04800677e-06],\n", - " [2.86391020e-01, 5.00728607e-01, 7.27913678e-02, ...,\n", - " 6.01911684e-04, 8.56203333e-05, 4.92153486e-05],\n", + "array([[1.57098308e-01, 2.81519257e-03, 5.96348643e-01, ...,\n", + " 1.08084995e-02, 1.95845146e-07, 1.42198633e-05],\n", + " [9.83384371e-01, 6.52049668e-04, 1.45791359e-02, ...,\n", + " 1.12365209e-03, 9.75986836e-07, 1.95965598e-07],\n", + " [1.68020770e-01, 3.79674375e-01, 1.86414778e-01, ...,\n", + " 1.67078048e-03, 1.21877249e-03, 3.75247910e-03],\n", " ...,\n", - " [1.48009066e-03, 1.05104391e-05, 1.81454215e-02, ...,\n", - " 1.07178465e-04, 4.83141348e-12, 1.84408755e-08],\n", - " [1.09512859e-03, 9.73159331e-06, 2.51230318e-02, ...,\n", - " 1.13380796e-04, 3.83674342e-11, 8.84786147e-08],\n", - " [6.47179200e-04, 1.02468675e-05, 1.26451282e-02, ...,\n", - " 4.21979857e-05, 4.80127786e-12, 1.27486942e-07]], dtype=float32)\n", + " [1.05072348e-03, 1.24680300e-05, 5.70231769e-03, ...,\n", + " 4.37476301e-05, 1.52421890e-07, 1.81421214e-07],\n", + " [6.52685121e-04, 4.47798493e-06, 5.04824053e-03, ...,\n", + " 2.13344283e-05, 1.52417726e-07, 1.62638599e-07],\n", + " [1.57185504e-03, 1.01540554e-05, 2.53849756e-02, ...,\n", + " 1.17763964e-04, 1.52426963e-07, 1.77946404e-07]], dtype=float32)\n", "Shape = (20814, 
10)\n", - "CPU times: user 5.91 s, sys: 572 ms, total: 6.48 s\n", - "Wall time: 58.2 s\n" + "CPU times: user 55.8 s, sys: 21.6 s, total: 1min 17s\n", + "Wall time: 2min 19s\n" ] } ], @@ -2295,9 +2405,21 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_12895/1185757098.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " sub['pred_' + str(i)] = te_pred.data[:,i]\n" + ] + }, { "data": { "text/html": [ @@ -2336,72 +2458,72 @@ " \n", " 0\n", " 200001.jpg\n", - " 0.037337\n", - " 0.007559\n", - " 0.331767\n", - " 0.036282\n", - " 0.215647\n", - " 0.347311\n", - " 0.003662\n", - " 0.020355\n", - " 5.549208e-05\n", - " 2.348856e-05\n", + " 0.157098\n", + " 0.002815\n", + " 0.596349\n", + " 0.020590\n", + " 1.148577e-01\n", + " 0.095614\n", + " 0.001854\n", + " 0.010808\n", + " 1.958451e-07\n", + " 1.421986e-05\n", " \n", " \n", " 1\n", " 200002.jpg\n", - " 0.947430\n", - " 0.001185\n", - " 0.035615\n", - " 0.000533\n", - " 0.012389\n", - " 0.000408\n", - " 0.000025\n", - " 0.002401\n", - " 1.316230e-05\n", - " 1.048007e-06\n", + " 0.983384\n", + " 0.000652\n", + " 0.014579\n", + " 0.000139\n", + " 6.825896e-05\n", + " 0.000044\n", + " 0.000008\n", + " 0.001124\n", + " 9.759868e-07\n", + " 1.959656e-07\n", " \n", " \n", " 2\n", " 200003.jpg\n", - " 0.286391\n", - " 0.500729\n", - " 0.072791\n", - " 0.000385\n", - " 0.000191\n", - " 0.040129\n", - " 0.098647\n", - " 0.000602\n", - " 8.562033e-05\n", - " 4.921535e-05\n", + " 0.168021\n", + " 0.379674\n", + " 0.186415\n", + " 0.000225\n", + " 1.850213e-03\n", + " 0.036919\n", + " 0.220253\n", + " 0.001671\n", + " 
1.218772e-03\n", + " 3.752479e-03\n", " \n", " \n", " 3\n", " 200004.jpg\n", - " 0.000020\n", - " 0.978054\n", - " 0.017930\n", - " 0.000190\n", - " 0.000370\n", - " 0.000252\n", - " 0.000518\n", - " 0.001085\n", - " 1.576396e-03\n", - " 4.549823e-06\n", + " 0.000013\n", + " 0.990730\n", + " 0.008530\n", + " 0.000097\n", + " 1.116415e-04\n", + " 0.000215\n", + " 0.000111\n", + " 0.000037\n", + " 1.548404e-04\n", + " 1.946677e-07\n", " \n", " \n", " 4\n", " 200005.jpg\n", - " 0.000421\n", - " 0.996886\n", - " 0.002095\n", - " 0.000037\n", - " 0.000001\n", - " 0.000042\n", - " 0.000015\n", - " 0.000387\n", - " 4.106595e-07\n", - " 1.145299e-04\n", + " 0.000340\n", + " 0.999536\n", + " 0.000031\n", + " 0.000002\n", + " 6.857538e-07\n", + " 0.000003\n", + " 0.000007\n", + " 0.000029\n", + " 5.404088e-07\n", + " 4.985940e-05\n", " \n", " \n", " ...\n", @@ -2420,72 +2542,72 @@ " \n", " 20809\n", " 203469.jpg\n", - " 0.003314\n", - " 0.000015\n", - " 0.080585\n", - " 0.898532\n", - " 0.000060\n", - " 0.017272\n", - " 0.000069\n", - " 0.000152\n", - " 1.868006e-10\n", - " 5.368335e-07\n", + " 0.003061\n", + " 0.000017\n", + " 0.041731\n", + " 0.943745\n", + " 1.648944e-04\n", + " 0.010877\n", + " 0.000146\n", + " 0.000258\n", + " 1.524480e-07\n", + " 2.509265e-07\n", " \n", " \n", " 20810\n", " 203469.jpg\n", - " 0.001965\n", - " 0.000014\n", - " 0.012343\n", - " 0.981738\n", - " 0.000065\n", - " 0.003774\n", - " 0.000024\n", - " 0.000076\n", - " 1.609716e-11\n", - " 4.625642e-08\n", + " 0.000430\n", + " 0.000003\n", + " 0.002508\n", + " 0.993409\n", + " 2.613632e-05\n", + " 0.003580\n", + " 0.000007\n", + " 0.000036\n", + " 1.524176e-07\n", + " 1.595918e-07\n", " \n", " \n", " 20811\n", " 203469.jpg\n", - " 0.001480\n", - " 0.000011\n", - " 0.018145\n", - " 0.966826\n", - " 0.000135\n", - " 0.013271\n", - " 0.000024\n", - " 0.000107\n", - " 4.831413e-12\n", - " 1.844088e-08\n", + " 0.001051\n", + " 0.000012\n", + " 0.005702\n", + " 0.989972\n", + " 5.734707e-05\n", + " 
0.003144\n", + " 0.000018\n", + " 0.000044\n", + " 1.524219e-07\n", + " 1.814212e-07\n", " \n", " \n", " 20812\n", " 203469.jpg\n", - " 0.001095\n", - " 0.000010\n", - " 0.025123\n", - " 0.962685\n", - " 0.000090\n", - " 0.010848\n", - " 0.000036\n", - " 0.000113\n", - " 3.836743e-11\n", - " 8.847861e-08\n", + " 0.000653\n", + " 0.000004\n", + " 0.005048\n", + " 0.990724\n", + " 3.223727e-05\n", + " 0.003505\n", + " 0.000012\n", + " 0.000021\n", + " 1.524177e-07\n", + " 1.626386e-07\n", " \n", " \n", " 20813\n", " 203469.jpg\n", - " 0.000647\n", + " 0.001572\n", " 0.000010\n", - " 0.012645\n", - " 0.981308\n", - " 0.000062\n", - " 0.005263\n", - " 0.000022\n", - " 0.000042\n", - " 4.801278e-12\n", - " 1.274869e-07\n", + " 0.025385\n", + " 0.965282\n", + " 1.030424e-04\n", + " 0.007472\n", + " 0.000058\n", + " 0.000118\n", + " 1.524270e-07\n", + " 1.779464e-07\n", " \n", " \n", "\n", @@ -2493,36 +2615,36 @@ "" ], "text/plain": [ - " image_id pred_0 pred_1 pred_2 pred_3 pred_4 pred_5 \\\n", - "0 200001.jpg 0.037337 0.007559 0.331767 0.036282 0.215647 0.347311 \n", - "1 200002.jpg 0.947430 0.001185 0.035615 0.000533 0.012389 0.000408 \n", - "2 200003.jpg 0.286391 0.500729 0.072791 0.000385 0.000191 0.040129 \n", - "3 200004.jpg 0.000020 0.978054 0.017930 0.000190 0.000370 0.000252 \n", - "4 200005.jpg 0.000421 0.996886 0.002095 0.000037 0.000001 0.000042 \n", - "... ... ... ... ... ... ... ... 
\n", - "20809 203469.jpg 0.003314 0.000015 0.080585 0.898532 0.000060 0.017272 \n", - "20810 203469.jpg 0.001965 0.000014 0.012343 0.981738 0.000065 0.003774 \n", - "20811 203469.jpg 0.001480 0.000011 0.018145 0.966826 0.000135 0.013271 \n", - "20812 203469.jpg 0.001095 0.000010 0.025123 0.962685 0.000090 0.010848 \n", - "20813 203469.jpg 0.000647 0.000010 0.012645 0.981308 0.000062 0.005263 \n", + " image_id pred_0 pred_1 pred_2 pred_3 pred_4 \\\n", + "0 200001.jpg 0.157098 0.002815 0.596349 0.020590 1.148577e-01 \n", + "1 200002.jpg 0.983384 0.000652 0.014579 0.000139 6.825896e-05 \n", + "2 200003.jpg 0.168021 0.379674 0.186415 0.000225 1.850213e-03 \n", + "3 200004.jpg 0.000013 0.990730 0.008530 0.000097 1.116415e-04 \n", + "4 200005.jpg 0.000340 0.999536 0.000031 0.000002 6.857538e-07 \n", + "... ... ... ... ... ... ... \n", + "20809 203469.jpg 0.003061 0.000017 0.041731 0.943745 1.648944e-04 \n", + "20810 203469.jpg 0.000430 0.000003 0.002508 0.993409 2.613632e-05 \n", + "20811 203469.jpg 0.001051 0.000012 0.005702 0.989972 5.734707e-05 \n", + "20812 203469.jpg 0.000653 0.000004 0.005048 0.990724 3.223727e-05 \n", + "20813 203469.jpg 0.001572 0.000010 0.025385 0.965282 1.030424e-04 \n", "\n", - " pred_6 pred_7 pred_8 pred_9 \n", - "0 0.003662 0.020355 5.549208e-05 2.348856e-05 \n", - "1 0.000025 0.002401 1.316230e-05 1.048007e-06 \n", - "2 0.098647 0.000602 8.562033e-05 4.921535e-05 \n", - "3 0.000518 0.001085 1.576396e-03 4.549823e-06 \n", - "4 0.000015 0.000387 4.106595e-07 1.145299e-04 \n", - "... ... ... ... ... 
\n", - "20809 0.000069 0.000152 1.868006e-10 5.368335e-07 \n", - "20810 0.000024 0.000076 1.609716e-11 4.625642e-08 \n", - "20811 0.000024 0.000107 4.831413e-12 1.844088e-08 \n", - "20812 0.000036 0.000113 3.836743e-11 8.847861e-08 \n", - "20813 0.000022 0.000042 4.801278e-12 1.274869e-07 \n", + " pred_5 pred_6 pred_7 pred_8 pred_9 \n", + "0 0.095614 0.001854 0.010808 1.958451e-07 1.421986e-05 \n", + "1 0.000044 0.000008 0.001124 9.759868e-07 1.959656e-07 \n", + "2 0.036919 0.220253 0.001671 1.218772e-03 3.752479e-03 \n", + "3 0.000215 0.000111 0.000037 1.548404e-04 1.946677e-07 \n", + "4 0.000003 0.000007 0.000029 5.404088e-07 4.985940e-05 \n", + "... ... ... ... ... ... \n", + "20809 0.010877 0.000146 0.000258 1.524480e-07 2.509265e-07 \n", + "20810 0.003580 0.000007 0.000036 1.524176e-07 1.595918e-07 \n", + "20811 0.003144 0.000018 0.000044 1.524219e-07 1.814212e-07 \n", + "20812 0.003505 0.000012 0.000021 1.524177e-07 1.626386e-07 \n", + "20813 0.007472 0.000058 0.000118 1.524270e-07 1.779464e-07 \n", "\n", "[20814 rows x 11 columns]" ] }, - "execution_count": 29, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -2537,7 +2659,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -2578,72 +2700,72 @@ " \n", " 0\n", " 200001.jpg\n", - " 0.053227\n", - " 0.003212\n", - " 0.419196\n", - " 3.920345e-02\n", - " 0.310807\n", - " 0.149547\n", - " 0.004056\n", - " 0.020704\n", - " 1.978658e-05\n", - " 2.894970e-05\n", + " 0.127650\n", + " 0.001409\n", + " 0.599914\n", + " 0.017568\n", + " 0.136898\n", + " 0.106915\n", + " 0.001796\n", + " 0.007829\n", + " 8.801418e-06\n", + " 1.216593e-05\n", " \n", " \n", " 1\n", " 200002.jpg\n", - " 0.952464\n", - " 0.001455\n", - " 0.031027\n", - " 4.967228e-04\n", - " 0.008271\n", - " 0.000645\n", - " 0.000039\n", - " 0.005577\n", - " 2.432690e-05\n", - " 9.184693e-07\n", + " 0.937035\n", + " 0.000638\n", + " 0.060420\n", + " 0.000098\n", + " 
0.000105\n", + " 0.000096\n", + " 0.000016\n", + " 0.001586\n", + " 6.087082e-06\n", + " 2.249314e-07\n", " \n", " \n", " 2\n", " 200003.jpg\n", - " 0.246870\n", - " 0.569154\n", - " 0.080173\n", - " 7.125317e-04\n", - " 0.001712\n", - " 0.058631\n", - " 0.041982\n", - " 0.000260\n", - " 2.588407e-04\n", - " 2.464455e-04\n", + " 0.120163\n", + " 0.523312\n", + " 0.106169\n", + " 0.000473\n", + " 0.000748\n", + " 0.042688\n", + " 0.201373\n", + " 0.002807\n", + " 1.389023e-03\n", + " 8.788786e-04\n", " \n", " \n", " 3\n", " 200004.jpg\n", - " 0.000058\n", - " 0.959784\n", - " 0.011034\n", - " 2.430165e-03\n", - " 0.003499\n", - " 0.001969\n", - " 0.013944\n", - " 0.003807\n", - " 3.462628e-03\n", - " 1.142663e-05\n", + " 0.000020\n", + " 0.888623\n", + " 0.006415\n", + " 0.001150\n", + " 0.000430\n", + " 0.004390\n", + " 0.000616\n", + " 0.001799\n", + " 9.654120e-02\n", + " 1.466518e-05\n", " \n", " \n", " 4\n", " 200005.jpg\n", - " 0.001487\n", - " 0.977418\n", - " 0.008626\n", - " 2.508827e-05\n", - " 0.000008\n", - " 0.000428\n", - " 0.000075\n", - " 0.011624\n", - " 2.858276e-06\n", - " 3.070692e-04\n", + " 0.000680\n", + " 0.998898\n", + " 0.000085\n", + " 0.000009\n", + " 0.000001\n", + " 0.000002\n", + " 0.000021\n", + " 0.000172\n", + " 1.743805e-06\n", + " 1.304403e-04\n", " \n", " \n", " ...\n", @@ -2662,72 +2784,72 @@ " \n", " 3464\n", " 203465.jpg\n", - " 0.000320\n", - " 0.005188\n", - " 0.004152\n", - " 9.781365e-01\n", - " 0.005305\n", - " 0.001066\n", - " 0.002319\n", - " 0.000410\n", - " 2.963319e-03\n", - " 1.392240e-04\n", + " 0.000224\n", + " 0.002143\n", + " 0.001514\n", + " 0.990281\n", + " 0.002657\n", + " 0.000401\n", + " 0.001074\n", + " 0.000134\n", + " 1.530934e-03\n", + " 4.091801e-05\n", " \n", " \n", " 3465\n", " 203466.jpg\n", - " 0.265845\n", - " 0.012218\n", - " 0.721779\n", - " 9.995165e-07\n", - " 0.000029\n", - " 0.000084\n", - " 0.000012\n", - " 0.000009\n", - " 2.496185e-05\n", - " 1.246285e-07\n", + " 0.250769\n", + " 
0.007148\n", + " 0.741840\n", + " 0.000002\n", + " 0.000022\n", + " 0.000013\n", + " 0.000076\n", + " 0.000129\n", + " 2.629060e-07\n", + " 2.120279e-07\n", " \n", " \n", " 3466\n", " 203467.jpg\n", - " 0.926839\n", - " 0.007073\n", - " 0.001850\n", - " 2.035212e-03\n", - " 0.029714\n", - " 0.015886\n", - " 0.000108\n", - " 0.016488\n", - " 2.400547e-07\n", - " 5.314636e-06\n", + " 0.960745\n", + " 0.004105\n", + " 0.001135\n", + " 0.000646\n", + " 0.016724\n", + " 0.008584\n", + " 0.000062\n", + " 0.007749\n", + " 2.438365e-04\n", + " 6.326832e-06\n", " \n", " \n", " 3467\n", " 203468.jpg\n", - " 0.008029\n", - " 0.002945\n", - " 0.025131\n", - " 3.647061e-05\n", - " 0.000677\n", - " 0.000099\n", - " 0.004348\n", - " 0.000237\n", - " 9.584755e-01\n", - " 2.236708e-05\n", + " 0.003675\n", + " 0.001097\n", + " 0.038018\n", + " 0.000038\n", + " 0.000483\n", + " 0.000310\n", + " 0.000223\n", + " 0.000208\n", + " 9.551883e-01\n", + " 7.596347e-04\n", " \n", " \n", " 3468\n", " 203469.jpg\n", - " 0.001868\n", - " 0.000015\n", - " 0.028723\n", - " 9.598277e-01\n", - " 0.000096\n", - " 0.009313\n", - " 0.000037\n", - " 0.000120\n", - " 5.089584e-11\n", - " 1.515332e-07\n", + " 0.001372\n", + " 0.000012\n", + " 0.015432\n", + " 0.977533\n", + " 0.000086\n", + " 0.005415\n", + " 0.000046\n", + " 0.000104\n", + " 1.524300e-07\n", + " 1.962799e-07\n", " \n", " \n", "\n", @@ -2735,36 +2857,36 @@ "" ], "text/plain": [ - " image_id pred_0 pred_1 pred_2 pred_3 pred_4 \\\n", - "0 200001.jpg 0.053227 0.003212 0.419196 3.920345e-02 0.310807 \n", - "1 200002.jpg 0.952464 0.001455 0.031027 4.967228e-04 0.008271 \n", - "2 200003.jpg 0.246870 0.569154 0.080173 7.125317e-04 0.001712 \n", - "3 200004.jpg 0.000058 0.959784 0.011034 2.430165e-03 0.003499 \n", - "4 200005.jpg 0.001487 0.977418 0.008626 2.508827e-05 0.000008 \n", - "... ... ... ... ... ... ... 
\n", - "3464 203465.jpg 0.000320 0.005188 0.004152 9.781365e-01 0.005305 \n", - "3465 203466.jpg 0.265845 0.012218 0.721779 9.995165e-07 0.000029 \n", - "3466 203467.jpg 0.926839 0.007073 0.001850 2.035212e-03 0.029714 \n", - "3467 203468.jpg 0.008029 0.002945 0.025131 3.647061e-05 0.000677 \n", - "3468 203469.jpg 0.001868 0.000015 0.028723 9.598277e-01 0.000096 \n", + " image_id pred_0 pred_1 pred_2 pred_3 pred_4 pred_5 \\\n", + "0 200001.jpg 0.127650 0.001409 0.599914 0.017568 0.136898 0.106915 \n", + "1 200002.jpg 0.937035 0.000638 0.060420 0.000098 0.000105 0.000096 \n", + "2 200003.jpg 0.120163 0.523312 0.106169 0.000473 0.000748 0.042688 \n", + "3 200004.jpg 0.000020 0.888623 0.006415 0.001150 0.000430 0.004390 \n", + "4 200005.jpg 0.000680 0.998898 0.000085 0.000009 0.000001 0.000002 \n", + "... ... ... ... ... ... ... ... \n", + "3464 203465.jpg 0.000224 0.002143 0.001514 0.990281 0.002657 0.000401 \n", + "3465 203466.jpg 0.250769 0.007148 0.741840 0.000002 0.000022 0.000013 \n", + "3466 203467.jpg 0.960745 0.004105 0.001135 0.000646 0.016724 0.008584 \n", + "3467 203468.jpg 0.003675 0.001097 0.038018 0.000038 0.000483 0.000310 \n", + "3468 203469.jpg 0.001372 0.000012 0.015432 0.977533 0.000086 0.005415 \n", "\n", - " pred_5 pred_6 pred_7 pred_8 pred_9 \n", - "0 0.149547 0.004056 0.020704 1.978658e-05 2.894970e-05 \n", - "1 0.000645 0.000039 0.005577 2.432690e-05 9.184693e-07 \n", - "2 0.058631 0.041982 0.000260 2.588407e-04 2.464455e-04 \n", - "3 0.001969 0.013944 0.003807 3.462628e-03 1.142663e-05 \n", - "4 0.000428 0.000075 0.011624 2.858276e-06 3.070692e-04 \n", - "... ... ... ... ... ... 
\n", - "3464 0.001066 0.002319 0.000410 2.963319e-03 1.392240e-04 \n", - "3465 0.000084 0.000012 0.000009 2.496185e-05 1.246285e-07 \n", - "3466 0.015886 0.000108 0.016488 2.400547e-07 5.314636e-06 \n", - "3467 0.000099 0.004348 0.000237 9.584755e-01 2.236708e-05 \n", - "3468 0.009313 0.000037 0.000120 5.089584e-11 1.515332e-07 \n", + " pred_6 pred_7 pred_8 pred_9 \n", + "0 0.001796 0.007829 8.801418e-06 1.216593e-05 \n", + "1 0.000016 0.001586 6.087082e-06 2.249314e-07 \n", + "2 0.201373 0.002807 1.389023e-03 8.788786e-04 \n", + "3 0.000616 0.001799 9.654120e-02 1.466518e-05 \n", + "4 0.000021 0.000172 1.743805e-06 1.304403e-04 \n", + "... ... ... ... ... \n", + "3464 0.001074 0.000134 1.530934e-03 4.091801e-05 \n", + "3465 0.000076 0.000129 2.629060e-07 2.120279e-07 \n", + "3466 0.000062 0.007749 2.438365e-04 6.326832e-06 \n", + "3467 0.000223 0.000208 9.551883e-01 7.596347e-04 \n", + "3468 0.000046 0.000104 1.524300e-07 1.962799e-07 \n", "\n", "[3469 rows x 11 columns]" ] }, - "execution_count": 30, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -2776,7 +2898,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -2796,7 +2918,7 @@ "Length: 3469, dtype: object" ] }, - "execution_count": 31, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2808,7 +2930,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": { "execution": { "iopub.execute_input": "2022-06-09T09:09:41.251065Z", @@ -2922,7 +3044,7 @@ "[3469 rows x 2 columns]" ] }, - "execution_count": 32, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -2996,7 +3118,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.17" }, "vscode": { "interpreter": { From a2680de221516d0795d4b90a3504f1093da67a81 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Thu, 3 Aug 
2023 15:04:45 +0000 Subject: [PATCH 07/49] new example --- examples/tutorials/Tutorial_8_CV_preset.ipynb | 233 ++++++++++++++++++ 1 file changed, 233 insertions(+) diff --git a/examples/tutorials/Tutorial_8_CV_preset.ipynb b/examples/tutorials/Tutorial_8_CV_preset.ipynb index 8a946476..18f7c68a 100644 --- a/examples/tutorials/Tutorial_8_CV_preset.ipynb +++ b/examples/tutorials/Tutorial_8_CV_preset.ipynb @@ -3055,6 +3055,239 @@ "sub[['image_id', 'label']]" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No we can choose another model from timm. So we will use resnet50.a1_in1k, by default it uses vit_base_patch16_224.augreg_in21k" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "automl = TabularCVAutoML(task = task,\n", + " timeout=5 * 3600,\n", + " autocv_features={\"embed_model\": 'timm/tf_efficientnetv2_b0.in1k'},\n", + " cpu_limit = 2,\n", + " reader_params = {'cv': 5, 'random_state': 42})" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[14:37:43] Stdout logging level is INFO3.\n", + "[14:37:43] Task: multiclass\n", + "\n", + "[14:37:43] Start automl preset with listed constraints:\n", + "[14:37:43] - time: 18000.00 seconds\n", + "[14:37:43] - CPU: 2 cores\n", + "[14:37:43] - memory: 16 GB\n", + "\n", + "[14:37:43] \u001b[1mTrain data shape: (114477, 5)\u001b[0m\n", + "\n", + "[14:37:43] Layer \u001b[1m1\u001b[0m train process start. 
Time left 17999.80 secs\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2795cb31118c42a8a3c0753468f54c4a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading model.safetensors: 0%| | 0.00/28.8M [00:00 Date: Mon, 7 Aug 2023 08:47:09 +0000 Subject: [PATCH 08/49] chnged autonlp params --- lightautoml/automl/presets/text_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightautoml/automl/presets/text_config.yml b/lightautoml/automl/presets/text_config.yml index 0d3b9d37..45886d1c 100755 --- a/lightautoml/automl/presets/text_config.yml +++ b/lightautoml/automl/presets/text_config.yml @@ -282,7 +282,7 @@ autonlp_params: # 'pooled_bert' - embeddings from pooled bert output # 'wat' - weighted average transformers # borep and random_lstm: https://arxiv.org/abs/1901.10444 - model_name: 'random_lstm_bert' + model_name: 'pooled_bert' # dict with params of random_lstm, bert_embedder, borep or wat # check corresponding classes for details From 81c444f6120196dc631dcbb57565461bac2bb784 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Thu, 10 Aug 2023 10:02:44 +0000 Subject: [PATCH 09/49] add autoint --- lightautoml/automl/presets/image_config.yml | 2 + lightautoml/automl/presets/tabular_config.yml | 8 +- lightautoml/automl/presets/tabular_presets.py | 2 + lightautoml/automl/presets/text_config.yml | 2 + .../automl/presets/time_series_config.yml | 2 + lightautoml/ml_algo/dl_model.py | 51 ++- .../torch_based/autoint/autoint_utils.py | 307 ++++++++++++++++ .../ml_algo/torch_based/autoint/ghost_norm.py | 79 ++++ lightautoml/ml_algo/torch_based/nn_models.py | 208 ++++++++++- lightautoml/text/nn_model.py | 337 +++++++++++++++++- 10 files changed, 964 insertions(+), 34 deletions(-) create mode 100644 lightautoml/ml_algo/torch_based/autoint/autoint_utils.py create mode 100644 lightautoml/ml_algo/torch_based/autoint/ghost_norm.py diff --git a/lightautoml/automl/presets/image_config.yml 
b/lightautoml/automl/presets/image_config.yml index b937ae83..01c04b30 100755 --- a/lightautoml/automl/presets/image_config.yml +++ b/lightautoml/automl/presets/image_config.yml @@ -242,6 +242,8 @@ nn_params: # Look for NN train params here. # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 # use model with custom embeddings model_with_emb: false # tune custom network diff --git a/lightautoml/automl/presets/tabular_config.yml b/lightautoml/automl/presets/tabular_config.yml index 0560d046..691609f4 100755 --- a/lightautoml/automl/presets/tabular_config.yml +++ b/lightautoml/automl/presets/tabular_config.yml @@ -128,6 +128,8 @@ nn_params: # Look for NN train params here. # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 # use model with custom embeddings model_with_emb: false # tune custom network @@ -146,7 +148,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -154,7 +156,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256, 128, 64] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block @@ -178,7 +180,7 @@ nn_params: # scheduler sch: ReduceLROnPlateau # params of ReduceLROnPlateau scheduler - scheduler_params: { 'patience': 5, 'factor': 0.5, 'min_lr': 0.00001 } + scheduler_params: {} #{ 'patience': 5, 'factor': 0.5, 'min_lr': 0.00001 } # using snapshot ensembles # https://arxiv.org/abs/1704.00109 
is_snap: false diff --git a/lightautoml/automl/presets/tabular_presets.py b/lightautoml/automl/presets/tabular_presets.py index f4d1101d..cf4ba8fe 100755 --- a/lightautoml/automl/presets/tabular_presets.py +++ b/lightautoml/automl/presets/tabular_presets.py @@ -607,6 +607,8 @@ def create_automl(self, **fit_args): "linear_layer", "_linear_layer", "node", + "autoint", + "autoint_emb_v2", ] available_nn_models = available_nn_models + [x + "_tuned" for x in available_nn_models] nn_models = [ diff --git a/lightautoml/automl/presets/text_config.yml b/lightautoml/automl/presets/text_config.yml index 45886d1c..14d9c3f1 100755 --- a/lightautoml/automl/presets/text_config.yml +++ b/lightautoml/automl/presets/text_config.yml @@ -120,6 +120,8 @@ linear_l2_params: # params for NN model nn_params: + # embedding_size if needed + embedding_size: 10 # early stopping and scheduler use metric stop_by_metric: False random_state: 42 diff --git a/lightautoml/automl/presets/time_series_config.yml b/lightautoml/automl/presets/time_series_config.yml index 0b4e84b1..2e4cbdb7 100644 --- a/lightautoml/automl/presets/time_series_config.yml +++ b/lightautoml/automl/presets/time_series_config.yml @@ -132,6 +132,8 @@ nn_params: # Look for NN train params here. 
# str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 # use model with custom embeddings model_with_emb: false # tune custom network diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 8db9d7db..8b968f35 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -1,5 +1,6 @@ """Neural net for tabular datasets.""" + from lightautoml.utils.installation import __validate_extra_deps @@ -43,7 +44,7 @@ from ..ml_algo.base import TabularDataset from ..ml_algo.base import TabularMLAlgo from ..pipelines.utils import get_columns_by_role -from ..text.nn_model import CatEmbedder +from ..text.nn_model import CatEmbedder, DefaultEmbedding, DenseEmbedding, LinearEmbedding, BasicEmbedding from ..text.nn_model import ContEmbedder from ..text.nn_model import TextBert from ..text.nn_model import TorchUniversalModel @@ -63,6 +64,7 @@ from .torch_based.nn_models import LinearLayer from .torch_based.nn_models import ResNetModel from .torch_based.nn_models import _LinearLayer +from .torch_based.nn_models import AutoInt logger = logging.getLogger(__name__) @@ -76,6 +78,32 @@ "_linear_layer": _LinearLayer, "snn": SNN, "node": NODE, + "autoint": AutoInt, + "autoint_emb_v2": AutoInt, +} +cat_embedder_by_name = { + "denselight": CatEmbedder, + "dense": CatEmbedder, + "resnet": CatEmbedder, + "mlp": CatEmbedder, + "linear_layer": CatEmbedder, + "_linear_layer": CatEmbedder, + "snn": CatEmbedder, + "node": CatEmbedder, + "autoint": BasicEmbedding, + "autoint_emb_v2": DefaultEmbedding, +} +cont_embedder_params_by_name = { + "denselight": ContEmbedder, + "dense": ContEmbedder, + "resnet": ContEmbedder, + "mlp": ContEmbedder, + "linear_layer": ContEmbedder, + "_linear_layer": ContEmbedder, + "snn": ContEmbedder, + "node": ContEmbedder, + "autoint": LinearEmbedding, + "autoint_emb_v2": DenseEmbedding, } @@ -245,23 +273,29 @@ def 
_infer_params(self): if isinstance(params[p_name], str): params[p_name] = getattr(module, params[p_name]) + # params = self._select_params(params) model = Trainer( net=TorchUniversalModel if not params["model_with_emb"] else params["model"], net_params={ "task": self.task, - "cont_embedder": ContEmbedder if is_cont else None, + "cont_embedder": cont_embedder_params_by_name[params["model"]] if is_cont else None, "cont_params": { - "num_dims": params["cont_dim"], + "num_dims": params["num_dims"], "input_bn": params["input_bn"], + "device": params["device"], + "embedding_size": params["embedding_size"], } if is_cont else None, - "cat_embedder": CatEmbedder if is_cat else None, + "cat_embedder": cat_embedder_by_name[params["model"]] if is_cat else None, "cat_params": { + "cat_vc": params["cat_vc"], "cat_dims": params["cat_dims"], "emb_dropout": params["emb_dropout"], "emb_ratio": params["emb_ratio"], "max_emb_size": params["max_emb_size"], + "embedding_size": params["embedding_size"], + "device": params["device"], } if is_cat else None, @@ -350,6 +384,7 @@ def _init_params_on_input(self, train_valid_iterator) -> dict: # Cat_features are needed to be preprocessed with LE, where 0 = not known category valid = train_valid_iterator.get_validation_data() + cat_value_counts = [] for cat_feature in new_params["cat_features"]: num_unique_categories = ( max( @@ -358,18 +393,20 @@ def _init_params_on_input(self, train_valid_iterator) -> dict: ) + 1 ) + values, counts = np.unique(train_valid_iterator.train[:, cat_feature].data, return_counts=True) + cat_value_counts.append(dict(zip(values, counts))) cat_dims.append(num_unique_categories) new_params["cat_dims"] = cat_dims - + new_params["cat_vc"] = cat_value_counts new_params["cont_features"] = get_columns_by_role(train_valid_iterator.train, "Numeric") - new_params["cont_dim"] = len(new_params["cont_features"]) + new_params["num_dims"] = len(new_params["cont_features"]) new_params["text_features"] = 
get_columns_by_role(train_valid_iterator.train, "Text") new_params["bias"] = self.get_mean_target(target, task_name) if params["init_bias"] else None logger.debug(f'number of text features: {len(new_params["text_features"])} ') logger.debug(f'number of categorical features: {len(new_params["cat_features"])} ') - logger.debug(f'number of continuous features: {new_params["cont_dim"]} ') + logger.debug(f'number of continuous features: {new_params["num_dims"]} ') return new_params diff --git a/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py b/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py new file mode 100644 index 00000000..c14944f5 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py @@ -0,0 +1,307 @@ +"""PyTorch modules for the AutoInt model.""" +# Paper: https://arxiv.org/pdf/1810.11921v2.pdf +# Official implementation: https://github.com/DeepGraphLearning/RecommenderSystems + +from collections import namedtuple +from typing import Optional, Type, Union +import torch +from torch import nn, Tensor +from torch.nn import functional as F + + +EmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "output_size"]) +UniformEmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "embedding_size", "output_size"]) + +MODULE_INIT_DOC = """ +Parameters +---------- +output_size : int + number of final output values; i.e., number of targets for + regression or number of classes for classification +embedding_num : EmbeddingBase or None + initialized and fit embedding for numeric fields +embedding_cat : EmbeddingBase or None + initialized and fit embedding for categorical fields +embedding_l1_reg : float, optional + value for l1 regularization of embedding vectors; default is 0.0 +embedding_l2_reg : float, optional + value for l2 regularization of embedding vectors; default is 0.0 +{} +mlp_hidden_sizes : int or iterable of int, optional + sizes for the linear transformations between the MLP input and + the output size needed based 
on the target; default is (512, 256, 128, 64) +mlp_activation : subclass of torch.nn.Module (uninitialized), optional + default is nn.LeakyReLU +mlp_use_bn : boolean, optional + whether to use batch normalization between MLP linear layers; + default is True +mlp_bn_momentum : float, optional + only used if `mlp_use_bn` is True; default is 0.01 +mlp_ghost_batch : int or None, optional + only used if `mlp_use_bn` is True; size of batch in "ghost batch norm"; + if None, normal batch norm is used; defualt is None +mlp_dropout : float, optional + whether and how much dropout to use between MLP linear layers; + `0.0 <= mlp_dropout < 1.0`; default is 0.0 +mlp_use_skip : boolean, optional + use a side path in the MLP containing just the optional leaky gate + plus single linear layer; default is True +mlp_l1_reg : float, optional + value for l1 regularization of MLP weights; default is 0.0 +mlp_l2_reg : float, optional + value for l2 regularization of MLP weights; default is 0.0 +use_leaky_gate : boolean, optional + whether to include "leaky gate" layers; default is True +loss_fn : "auto" or PyTorch loss function, optional + default is "auto" +device : string or torch.device, optional + default is "cpu" + +""" + + +class LeakyGate(nn.Module): + """LeakyGate from https://github.com/jrfiedler/xynn. + + This performs an element-wise linear transformation followed by a chosen + activation; the default activation is nn.LeakyReLU. Fields may be + represented by individual values or vectors of values (i.e., embedded). + + Input needs to be shaped like (num_rows, num_fields) or + (num_rows, num_fields, embedding_size) + + Args: + input_size: input_size. + bias: if to use bias. + activation: activation function. + device: device. 
def _initialized_tensor(*sizes):
    """Create a trainable parameter of shape ``sizes`` with Kaiming-uniform init."""
    weight = nn.Parameter(torch.Tensor(*sizes))
    nn.init.kaiming_uniform_(weight)
    return weight


class AttnInteractionLayer(nn.Module):
    """The attention interaction layer for the AutoInt model.

    Paper for the original AutoInt model: https://arxiv.org/pdf/1810.11921v2.pdf

    Args:
        field_input_size : int
            original embedding size for each field
        field_output_size : int, optional
            embedding size after transformation; default is 8
        num_heads : int, optional
            number of attention heads; default is 2
        activation : subclass of torch.nn.Module or None, optional
            applied to the W tensors; default is None
        use_residual : bool, optional
            default is True
        dropout : float, optional
            default is 0.1
        normalize : bool, optional
            apply LayerNorm to the output; default is True
        ghost_batch_size : int or None, optional
            accepted for signature compatibility; not used by this layer
        device : string or torch.device, optional
            default is "cpu"

    """

    def __init__(
        self,
        field_input_size: int,
        field_output_size: int = 8,
        num_heads: int = 2,
        activation: Optional[Type[nn.Module]] = None,
        use_residual: bool = True,
        dropout: float = 0.1,
        normalize: bool = True,
        ghost_batch_size: Optional[int] = None,
        device: Union[str, torch.device] = "cpu",
    ):
        super().__init__()

        self.use_residual = use_residual

        self.W_q = _initialized_tensor(field_input_size, field_output_size, num_heads)
        self.W_k = _initialized_tensor(field_input_size, field_output_size, num_heads)
        self.W_v = _initialized_tensor(field_input_size, field_output_size, num_heads)

        if use_residual:
            self.W_r = _initialized_tensor(field_input_size, field_output_size * num_heads)
        else:
            self.W_r = None

        if activation:
            self.w_act = activation()
        else:
            self.w_act = nn.Identity()

        if dropout > 0.0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = nn.Identity()

        if normalize:
            self.layer_norm = nn.LayerNorm(field_output_size * num_heads)
        else:
            self.layer_norm = nn.Identity()

        self.to(device)

    def forward(self, x: Tensor) -> Tensor:
        """Transform the input tensor with attention interaction.

        Args:
            x : torch.Tensor
                3-d tensor; for example, embedded numeric and/or categorical values,
                or the output of a previous attention interaction layer

        Returns:
            torch.Tensor
                3-d tensor of shape (rows, fields, field_output_size * num_heads)

        """
        # R    : # rows
        # D, F : # fields (D indexes the querying field, F the attended field)
        # I    : field embedding size in
        # O    : field embedding size out
        # H    : # heads
        num_rows, num_fields, _ = x.shape  # R, F, I

        # (R, F, I) * (I, O, H) -> (R, F, O, H)
        qrys = torch.tensordot(x, self.w_act(self.W_q), dims=([-1], [0]))
        keys = torch.tensordot(x, self.w_act(self.W_k), dims=([-1], [0]))
        vals = torch.tensordot(x, self.w_act(self.W_v), dims=([-1], [0]))
        if self.use_residual:
            rsdl = torch.tensordot(x, self.w_act(self.W_r), dims=([-1], [0]))

        # score of query field d against key field f, per head
        product = torch.einsum("rdoh,rfoh->rdfh", qrys, keys)  # (R, D, F, H)

        alpha = F.softmax(product, dim=2)  # normalised over the attended fields F
        alpha = self.dropout(alpha)

        # Weighted sum of the value vectors over the attended fields.
        # The previous spec "rfdh,rfoh->rfoh" summed alpha over its own softmax
        # axis (== 1), which reduced the whole attention step to `out = vals`.
        # (R, D, F, H) * (R, F, O, H) -> (R, D, O, H)
        out = torch.einsum("rdfh,rfoh->rdoh", alpha, vals)
        out = out.reshape((num_rows, num_fields, -1))  # (R, F, O * H)
        if self.use_residual:
            out = out + rsdl  # (R, F, O * H)
        out = F.leaky_relu(out)
        out = self.layer_norm(out)

        return out
class GhostNorm(nn.Module):
    """Ghost Normalization.

    https://arxiv.org/pdf/1705.08741.pdf

    Args:
        inner_norm : torch.nn.Module (initialized)
            examples: `nn.BatchNorm1d`, `nn.LayerNorm`
        virtual_batch_size : int
        device : string or torch.device, optional
            default is "cpu"
    """

    def __init__(
        self,
        inner_norm: nn.Module,
        virtual_batch_size: int,
        device: Union[str, torch.device] = "cpu",
    ):
        super().__init__()
        self.virtual_batch_size = virtual_batch_size
        self.inner_norm = inner_norm
        self.to(device)

    def forward(self, x: Tensor) -> Tensor:
        """Normalize the input chunk by chunk along the batch dimension.

        Args:
            x : torch.Tensor

        Returns:
            torch.Tensor

        """
        # number of virtual batches needed so that none exceeds virtual_batch_size
        num_chunks = int(ceil(x.shape[0] / self.virtual_batch_size))
        chunk_norm = [self.inner_norm(chunk) for chunk in x.chunk(num_chunks, dim=0)]
        return torch.cat(chunk_norm, dim=0)


class GhostBatchNorm(GhostNorm):
    """Ghost Normalization, using BatchNorm1d as inner normalization.

    https://arxiv.org/pdf/1705.08741.pdf

    Args:
        num_features : int
        virtual_batch_size : int, optional
            default is 64
        momentum : float, optional
            default is 0.1
        device : string or torch.device, optional
            default is "cpu"
    """

    def __init__(
        self,
        num_features: int,
        virtual_batch_size: int = 64,
        momentum: float = 0.1,
        device: Union[str, torch.device] = "cpu",
    ):
        super().__init__(
            inner_norm=nn.BatchNorm1d(num_features, momentum=momentum),
            virtual_batch_size=virtual_batch_size,
            # previously dropped, so the module always stayed on the base default
            device=device,
        )
self.features.add_module("act", act_fun()) + if drop_rate: self.features.add_module("dropout", nn.Dropout(p=drop_rate)) if use_noise: self.features.add_module("noise", GaussianNoise(noise_std, device)) - self.features.add_module("dense", nn.Linear(n_in, n_out)) - self.features.add_module("act", act_fun()) + # self.features.add_module("dense", nn.Linear(n_in, n_out)) + # self.features.add_module("act", act_fun()) def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward-pass.""" @@ -115,9 +125,14 @@ class DenseLightModel(nn.Module): num_init_features: If not none add fc layer before model with certain dim. use_bn: Use BatchNorm. use_noise: Use noise. - concat_input: Concatenate input to all hidden layers. + concat_input: Concatenate input to all hidden layers. # MLP False + dropout_first: Use dropout in the first layer or not. + bn_momentum: BatchNorm momentum + ghost_batch: If not none use GhoastNorm with ghost_batch. + leaky_gate: Use LeakyGate or not. + use_skip: Use another Linear model to blend them after. + weighted_sum: Use weighted blender or half-half. device: Device to compute on. 
- """ def __init__( @@ -129,21 +144,32 @@ def __init__( 750, ], drop_rate: Union[float, List[float]] = 0.1, - act_fun: nn.Module = nn.ReLU, + act_fun: nn.Module = nn.LeakyReLU, noise_std: float = 0.05, num_init_features: Optional[int] = None, use_bn: bool = True, use_noise: bool = False, concat_input: bool = True, + dropout_first: bool = True, + bn_momentum: float = 0.1, + ghost_batch: Optional[int] = 64, + leaky_gate: bool = True, + use_skip: bool = True, + weighted_sum: bool = True, device: torch.device = torch.device("cuda:0"), **kwargs, ): super(DenseLightModel, self).__init__() + if isinstance(hidden_size, int): + hidden_size = [hidden_size] + if isinstance(drop_rate, float): - drop_rate = [drop_rate] * len(hidden_size) + drop_rate = [drop_rate] * (len(hidden_size) + (1 if dropout_first else 0)) - assert len(hidden_size) == len(drop_rate), "Wrong number hidden_sizes/drop_rates. Must be equal." + assert ( + len(hidden_size) == len(drop_rate) if not dropout_first else 1 + len(hidden_size) == len(drop_rate) + ), "Wrong number hidden_sizes/drop_rates. Must be equal." 
def forward(self, X: torch.Tensor) -> torch.Tensor:
    """Forward-pass.

    Runs ``self.features`` layer by layer, then ``self.fc``; when ``use_skip``
    is set, the result is blended with ``self.skip_layers(X)`` using
    ``sigmoid(self.mix)`` as the weight of the skip branch.
    """
    x = X
    # detached copy of the raw input that is re-attached in front of later dense blocks
    raw_input = x.detach().clone()
    for name, layer in self.features.named_children():
        # Only dense blocks after the first one are built to take the widened input.
        # The former test (`name != "denseblock1" and name != "dense0"`) also
        # matched "leakygate0" / "dropout0" and fed them a tensor of the wrong width.
        if self.concat_input and name.startswith("denseblock") and name != "denseblock1":
            x = torch.cat([x, raw_input], 1)
        x = layer(x)
    out = self.fc(x)
    if self.use_skip:
        mix = torch.sigmoid(self.mix)
        skip_out = self.skip_layers(X)
        out = mix * skip_out + (1 - mix) * out
    return out
class AutoInt(nn.Module):
    """AutoInt model: attention interaction over embedded fields, optionally blended with an MLP.

    Paper for the original AutoInt model: https://arxiv.org/pdf/1810.11921v2.pdf

    Args:
        n_in: Number of embedded fields.
        embedding_size: Embedding size of each input field.
        n_out: Output dim.
        attn_embedding_size: Per-head field embedding size produced by the attention layers.
        attn_num_layers: Number of attention layers.
        attn_num_heads: Number of attention heads per layer.
        attn_activation: Activation applied to the attention weight tensors, or None.
        attn_use_residual: Use the residual connection inside attention layers.
        attn_dropout: Dropout rate of the attention layers.
        attn_normalize: Normalize the output of attention layers.
        attn_use_mlp: If False, the head after the attention block has no hidden layers.
        mlp_hidden_sizes: Hidden sizes of the MLPs; a falsy value disables the parallel MLP branch.
        mlp_activation: Activation function of the MLPs.
        mlp_use_bn: Use BatchNorm in the MLPs.
        mlp_bn_momentum: BatchNorm momentum.
        mlp_ghost_batch: Ghost batch size passed to the MLPs and the attention block.
        mlp_dropout: Dropout rate of the MLPs.
        mlp_use_skip: Use the linear skip branch inside the MLPs.
        use_leaky_gate: Use LeakyGate on the inputs.
        weighted_sum: Learn the attention/MLP mixing weight; otherwise it is fixed (half-half).
        device: Device to compute on.
    """

    def __init__(
        self,
        n_in: int,
        embedding_size: int,
        n_out: int = 1,
        attn_embedding_size: int = 8,
        attn_num_layers: int = 3,
        attn_num_heads: int = 2,
        attn_activation: Optional[Type[nn.Module]] = None,
        attn_use_residual: bool = True,
        attn_dropout: float = 0.1,
        attn_normalize: bool = True,
        attn_use_mlp: bool = True,
        mlp_hidden_sizes: Union[int, Tuple[int, ...], List[int]] = (512, 256, 128, 64),
        mlp_activation: Type[nn.Module] = nn.LeakyReLU,
        mlp_use_bn: bool = True,
        mlp_bn_momentum: float = 0.1,
        mlp_ghost_batch: Optional[int] = 16,
        mlp_dropout: float = 0.0,
        mlp_use_skip: bool = True,
        use_leaky_gate: bool = True,
        weighted_sum: bool = True,
        device: Union[str, torch.device] = "cpu",
        **kwargs,
    ):
        super().__init__()
        device = torch.device(device)

        if use_leaky_gate:
            self.attn_gate = LeakyGate(n_in * embedding_size, device=device)
        else:
            self.attn_gate = nn.Identity()

        self.attn_interact = AttnInteractionBlock(
            field_input_size=embedding_size,
            field_output_size=attn_embedding_size,
            num_layers=attn_num_layers,
            num_heads=attn_num_heads,
            activation=attn_activation,
            use_residual=attn_use_residual,
            dropout=attn_dropout,
            normalize=attn_normalize,
            ghost_batch_size=mlp_ghost_batch,
            device=device,
        )

        # settings shared by the attention head and the parallel MLP branch
        mlp_kwargs = dict(
            n_out=n_out,
            act_fun=mlp_activation,
            drop_rate=mlp_dropout,
            use_bn=mlp_use_bn,
            bn_momentum=mlp_bn_momentum,
            ghost_batch=mlp_ghost_batch,
            leaky_gate=use_leaky_gate,
            use_skip=mlp_use_skip,
            device=device,
        )

        self.attn_final = MLP(
            n_in=n_in * attn_embedding_size * attn_num_heads,
            hidden_size=(mlp_hidden_sizes if mlp_hidden_sizes and attn_use_mlp else []),
            **mlp_kwargs,
        )

        if mlp_hidden_sizes:
            self.mlp = MLP(
                n_in=n_in * embedding_size,
                hidden_size=mlp_hidden_sizes,
                **mlp_kwargs,
            )
            if weighted_sum:
                self.mix = nn.Parameter(torch.tensor([0.0], device=device))
            else:
                # Non-persistent buffer: follows `.to()` / `.cuda()` like the rest of
                # the module but stays out of `state_dict`, as the plain tensor did.
                self.register_buffer("mix", torch.tensor([0.0], device=device), persistent=False)
        else:
            self.mlp = None
            self.mix = None

    def forward(self, embedded: torch.Tensor) -> torch.Tensor:
        """Transform the input tensor.

        Args:
            embedded : torch.Tensor
                embedded fields

        Returns:
            torch.Tensor

        """
        out = self.attn_gate(embedded)
        out = self.attn_interact(out)
        out = self.attn_final(out.reshape((out.shape[0], -1)))
        if self.mlp is not None:
            embedded_2d = embedded.reshape((embedded.shape[0], -1))
            mix = torch.sigmoid(self.mix)
            out = mix * out + (1 - mix) * self.mlp(embedded_2d)
        return out
+240,321 @@ def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: return output +class BasicEmbedding(nn.Module): + """A basic embedding that creates an embedded vector for each field value from https://github.com/jrfiedler/xynn. + + Args: + embedding_size : int, optional + size of each value's embedding vector; default is 10 + device : string or torch.device + + """ + + def __init__( + self, cat_vc: Sequence[Dict], embedding_size: int = 10, device: Union[str, torch.device] = "cuda:0", **kwargs + ): + super().__init__() + self._device = device + self._isfit = False + self.num_fields = 0 + self.output_size = 0 + self.lookup: Dict[Tuple[int, Any], int] = {} + self.lookup_nan: Dict[int, int] = {} + self.num_values = 0 + self.embedding: Optional[nn.Embedding] = None + self.embedding_size = embedding_size + self._from_summary(cat_vc) + self.cat_len = len(cat_vc) + + def _from_summary(self, uniques: List[Union[List, Tensor, np.ndarray]]): + lookup = {} + lookup_nan = {} + num_values = 0 + for fieldnum, field in enumerate(uniques): + for value in field: + if (fieldnum, value) in lookup: + # extra defense against repeated values + continue + lookup[(fieldnum, value)] = num_values + num_values += 1 + + self.num_fields = len(uniques) + self.output_size = self.num_fields * self.embedding_size + self.lookup = lookup + self.lookup_nan = lookup_nan + self.num_values = num_values + self.embedding = nn.Embedding(num_values, self.embedding_size) + nn.init.xavier_uniform_(self.embedding.weight) + self._isfit = True + + def get_out_shape(self) -> int: + """Output shape. + + Returns: + int with module output shape. + + """ + return self.cat_len + + def forward(self, X: Dict) -> Tensor: + """Produce embedding for each value in input. 
class DefaultEmbedding(nn.Module):
    """DefaultEmbedding from https://github.com/jrfiedler/xynn.

    Every field owns one extra "default" vector. A value that was not present
    in the summary passed at construction gets the default vector of its field.
    A known value gets a blend of its own vector and the default one:

        weight = count / (count + alpha)
        final = embedding * weight + default * (1 - weight)

    Args:
        cat_vc : sequence of dicts
            one ``{value: count}`` mapping per categorical field
        embedding_size : int, optional
            size of each value's embedding vector; default is 10
        alpha : int, optional
            smoothing constant of the blend above; a value seen exactly
            `alpha` times is weighted evenly with the default; default is 20
        device : string or torch.device
            device on which the index and weight tensors are created

    """

    def __init__(
        self,
        cat_vc: Sequence[Dict],
        embedding_size: int = 10,
        alpha: int = 20,
        device: Union[str, torch.device] = "cuda:0",
        **kwargs,
    ):
        super().__init__()
        self._device = device
        self.alpha = alpha
        self.embedding_size = embedding_size
        # placeholders; all of them are filled in by `_from_summary` below
        self._isfit = False
        self.num_fields = 0
        self.output_size = 0
        self.num_values = 0
        self.lookup: Dict[Tuple[int, Any], Tuple[int, int]] = {}
        self.lookup_default: Dict[int, Tuple[int, int]] = {}
        self.embedding: Optional[nn.Embedding] = None
        self._from_summary(cat_vc)
        self.cat_len = len(cat_vc)

    def _from_summary(self, unique_counts: List[Dict[Any, int]]):
        """Build the (field, value) -> (row, count) tables and the embedding matrix."""
        table = {}
        defaults = {}
        next_row = 0
        for field_idx, value_counts in enumerate(unique_counts):
            # the default row of a field precedes the rows of its values
            defaults[field_idx] = (next_row, 0)
            next_row += 1
            for value, count in value_counts.items():
                table[(field_idx, value)] = (next_row, count)
                next_row += 1

        self.lookup = table
        self.lookup_default = defaults
        self.num_values = next_row
        self.num_fields = len(unique_counts)
        self.output_size = self.num_fields * self.embedding_size
        self.embedding = nn.Embedding(next_row, self.embedding_size)
        nn.init.xavier_uniform_(self.embedding.weight)

        self._isfit = True

    def get_out_shape(self) -> int:
        """Output shape.

        Returns:
            int with module output shape.

        """
        return self.cat_len

    def forward(self, X: Dict) -> Tensor:
        """Produce embedding for each value in input.

        Args:
            X : Dict

        Returns:
            torch.Tensor
        """
        if not self._isfit:
            raise RuntimeError("need to call `fit` or `from_summary` first")

        blend: List[List[List[float]]] = []
        own_rows: List[List[int]] = []
        default_rows: List[List[int]] = []
        for sample in X["cat"]:
            sample_blend, sample_own, sample_default = [], [], []
            for col, cell in enumerate(sample):
                key = (col, cell.item())
                fallback = self.lookup_default[col]
                # unseen values resolve to the default row with count 0 -> weight 0
                row, count = self.lookup.get(key, fallback)
                sample_blend.append([count / (count + self.alpha)])
                sample_own.append(row)
                sample_default.append(fallback[0])
            blend.append(sample_blend)
            own_rows.append(sample_own)
            default_rows.append(sample_default)

        weights = torch.tensor(blend, dtype=torch.float32, device=self._device)
        own = self.embedding(torch.tensor(own_rows, dtype=torch.int64, device=self._device))
        default = self.embedding(torch.tensor(default_rows, dtype=torch.int64, device=self._device))
        return weights * own + (1 - weights) * default
class DenseEmbedding(nn.Module):
    """An embedding for numeric fields, consisting of just a linear transformation with an activation from https://github.com/jrfiedler/xynn.

    Maps an input with shape n_rows * n_fields to an output with shape
    n_rows * 1 * embedding_size if one value passed for embedding_size or
    n_rows * embedding_size[0] * embedding_size[1] if two values are passed

    Args:
        num_dims : int
            number of numeric input fields
        embedding_size : int, tuple of ints, or list of ints; optional
            size of each value's embedding vector; default is 10
        activation : subclass of torch.nn.Module, optional
            default is nn.LeakyReLU
    """

    def __init__(
        self,
        num_dims: int,
        embedding_size: Union[int, Tuple[int, ...], List[int]] = 10,
        activation: Type[nn.Module] = nn.LeakyReLU,
        **kwargs,
    ):
        super().__init__()

        if isinstance(embedding_size, int):
            embedding_size = (1, embedding_size)
        elif len(embedding_size) == 1:
            embedding_size = (1, embedding_size[0])
        self._isfit = False
        self.num_fields = num_dims
        self.output_size = 0
        self.embedding_w = None
        self.embedding_b = None
        self.dense_out_size = embedding_size
        self.embedding_size = embedding_size[-1]
        self.activation = activation()
        self._from_summary(self.num_fields)

    def _from_summary(self, num_fields: int):
        """Allocate the weight (num_fields, *dense_out_size) and bias (*dense_out_size)."""
        self.output_size = reduce(operator.mul, self.dense_out_size, 1)
        self.embedding_w = nn.Parameter(torch.zeros((num_fields, *self.dense_out_size)))
        self.embedding_b = nn.Parameter(torch.zeros(self.dense_out_size))
        nn.init.xavier_uniform_(self.embedding_w)
        self._isfit = True

    def get_out_shape(self) -> int:
        """Output shape.

        Returns:
            int with module output shape.

        """
        return self.dense_out_size[0]

    def forward(self, X: Dict) -> Tensor:
        """Produce embedding for each value in input.

        Args:
            X : Dict

        Returns:
            torch.Tensor

        """
        X = X["cont"]
        if not self._isfit:
            raise RuntimeError("need to call `fit` or `from_summary` first")
        # Contract the field axis of X (rows, fields) with the first axis of the
        # weight (fields, *dense_out_size). Same result as the former
        # `embedding_w.T.matmul(X.T).T`, without `.T` on a >2-d tensor, which
        # PyTorch has deprecated.
        embedded = torch.tensordot(X.to(dtype=torch.float), self.embedding_w, dims=([1], [0]))
        embedded = embedded + self.embedding_b
        # activation is applied on the flattened per-row vector, as before
        embedded = self.activation(embedded.reshape((X.shape[0], -1)))
        return embedded.reshape((X.shape[0], *self.dense_out_size))
a/lightautoml/text/embed.py b/lightautoml/text/embed.py new file mode 100644 index 00000000..eaa15558 --- /dev/null +++ b/lightautoml/text/embed.py @@ -0,0 +1,546 @@ +"""Neural Net modules for differen data types.""" + +import logging + +from typing import Any, List, Tuple, Type +from typing import Dict +from typing import Optional +from typing import Sequence +from typing import Union +from functools import reduce +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor +import operator + +try: + from transformers import AutoModel +except: + import warnings + + warnings.warn("'transformers' - package isn't installed") + +from .dl_transformers import pooling_by_name + + +logger = logging.getLogger(__name__) + + +class TextBert(nn.Module): + """Text data model. + + Class for working with text data based on HuggingFace transformers. + + Args: + model_name: Transformers model name. + pooling: Pooling type. + + Note: + There are different pooling types: + + - cls: Use CLS token for sentence embedding + from last hidden state. + - max: Maximum on seq_len dimension for non masked + inputs from last hidden state. + - mean: Mean on seq_len dimension for non masked + inputs from last hidden state. + - sum: Sum on seq_len dimension for non masked + inputs from last hidden state. + - none: Without pooling for seq2seq models. + + """ + + _poolers = {"cls", "max", "mean", "sum", "none"} + + def __init__(self, model_name: str = "bert-base-uncased", pooling: str = "cls"): + super(TextBert, self).__init__() + if pooling not in self._poolers: + raise ValueError("pooling - {} - not in the list of available types {}".format(pooling, self._poolers)) + + self.transformer = AutoModel.from_pretrained(model_name) + self.n_out = self.transformer.config.hidden_size + self.dropout = torch.nn.Dropout(0.2) + self.activation = torch.nn.ReLU(inplace=True) + self.pooling = pooling_by_name[pooling]() + + def get_out_shape(self) -> int: + """Output shape. 
class CatEmbedder(nn.Module):
    """Category data model.

    One ``nn.Embedding`` per categorical feature; the per-feature vectors are
    concatenated along the feature axis.

    Args:
        cat_dims: Sequence with number of unique categories
            for category features.
        emb_dropout: Dropout probability; a falsy value disables dropout.
        emb_ratio: Ratio for embedding size = (x + 1) // emb_ratio.
        max_emb_size: Max embedding size.

    """

    def __init__(
        self,
        cat_dims: Sequence[int],
        emb_dropout: float = 0.1,
        emb_ratio: int = 3,
        max_emb_size: int = 50,
        **kwargs,
    ):
        super(CatEmbedder, self).__init__()
        # (cardinality, embedding width) per feature; width is clipped to [1, max_emb_size]
        emb_dims = [(int(x), int(min(max_emb_size, max(1, (x + 1) // emb_ratio)))) for x in cat_dims]
        self.no_of_embs = sum(width for _, width in emb_dims)
        assert self.no_of_embs != 0, "The input is empty."
        # Embedding layers
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        self.emb_dropout_layer = nn.Dropout(emb_dropout) if emb_dropout else nn.Identity()

    def get_out_shape(self) -> int:
        """Output shape.

        Returns:
            Int with module output shape.

        """
        return self.no_of_embs

    def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Forward-pass."""
        output = torch.cat(
            [emb_layer(inp["cat"][:, i]) for i, emb_layer in enumerate(self.emb_layers)],
            dim=1,
        )
        output = self.emb_dropout_layer(output)
        return output
class BasicEmbedding(nn.Module):
    """A basic embedding that creates an embedded vector for each field value from https://github.com/jrfiedler/xynn.

    Args:
        cat_vc : sequence of dicts
            one mapping per categorical field; its keys are the known values
        embedding_size : int, optional
            size of each value's embedding vector; default is 10
        device : string or torch.device
            device on which the index tensor is created
        flatten_output: if flatten output or not.

    """

    def __init__(
        self,
        cat_vc: Sequence[Dict],
        embedding_size: int = 10,
        device: Union[str, torch.device] = "cuda:0",
        flatten_output: bool = False,
        **kwargs,
    ):
        super().__init__()
        self._device = device
        self.flatten_output = flatten_output
        self.embedding_size = embedding_size
        # placeholders; all of them are filled in by `_from_summary` below
        self._isfit = False
        self.num_fields = 0
        self.output_size = 0
        self.num_values = 0
        self.lookup: Dict[Tuple[int, Any], int] = {}
        self.lookup_nan: Dict[int, int] = {}
        self.embedding: Optional[nn.Embedding] = None
        self._from_summary(cat_vc)
        self.cat_len = len(cat_vc)

    def _from_summary(self, uniques: List[Union[List, Tensor, np.ndarray]]):
        """Assign one embedding row to every distinct (field, value) pair."""
        table = {}
        for field_idx, values in enumerate(uniques):
            for value in values:
                key = (field_idx, value)
                # repeated values keep the row they were given first
                if key not in table:
                    table[key] = len(table)

        self.lookup = table
        self.lookup_nan = {}
        self.num_values = len(table)
        self.num_fields = len(uniques)
        self.output_size = self.num_fields * self.embedding_size
        self.embedding = nn.Embedding(self.num_values, self.embedding_size)
        nn.init.xavier_uniform_(self.embedding.weight)
        self._isfit = True

    def get_out_shape(self) -> int:
        """Output shape.

        Returns:
            int with module output shape.

        """
        return self.cat_len * self.embedding_size if self.flatten_output else self.cat_len

    def forward(self, X: Dict) -> Tensor:
        """Produce embedding for each value in input.

        Args:
            X : Dict

        Returns:
            torch.Tensor

        """
        if not self._isfit:
            raise RuntimeError("need to call `fit` or `from_summary` first")
        # translate every cell into its embedding row; unknown values raise KeyError
        rows = [[self.lookup[(col, cell.item())] for col, cell in enumerate(sample)] for sample in X["cat"]]
        embedded = self.embedding(torch.tensor(rows, dtype=torch.int64, device=self._device))
        if not self.flatten_output:
            return embedded
        return embedded.view(embedded.shape[0], -1)
The weights for + the average are determined by the parameter `alpha`: + + weight = count / (count + alpha) + final = embedding * weight + default * (1 - weight) + + Args: + embedding_size : int, optional + size of each value's embedding vector; default is 10 + alpha : int, optional + controls the weighting of each embedding vector with the default; + when `alpha`-many values are seen at initialization; the final + vector is evenly weighted; the influence of the default is decreased + with either higher counts or lower `alpha`; default is 20 + device : string or torch.device + flatten_output: if flatten output or not. + + """ + + def __init__( + self, + cat_vc: Sequence[Dict], + embedding_size: int = 10, + alpha: int = 20, + device: Union[str, torch.device] = "cuda:0", + flatten_output: bool = False, + **kwargs, + ): + super().__init__() + self.flatten_output = flatten_output + self._isfit = False + self._device = device + self.num_fields = 0 + self.output_size = 0 + self.alpha = alpha + self.lookup: Dict[Tuple[int, Any], Tuple[int, int]] = {} + self.lookup_default: Dict[int, Tuple[int, int]] = {} + self.num_values = 0 + self.embedding: Optional[nn.Embedding] = None + self.embedding_size = embedding_size + self._from_summary(cat_vc) + self.cat_len = len(cat_vc) + + def _from_summary(self, unique_counts: List[Dict[Any, int]]): + lookup = {} + lookup_default = {} + num_values = 0 + for fieldnum, counts in enumerate(unique_counts): + lookup_default[fieldnum] = (num_values, 0) + num_values += 1 + for value, count in counts.items(): + lookup[(fieldnum, value)] = (num_values, count) + num_values += 1 + + self.num_fields = len(unique_counts) + self.output_size = self.num_fields * self.embedding_size + self.lookup = lookup + self.lookup_default = lookup_default + self.num_values = num_values + self.embedding = nn.Embedding(num_values, self.embedding_size) + nn.init.xavier_uniform_(self.embedding.weight) + + self._isfit = True + + def get_out_shape(self) -> int: + """Output 
shape. + + Returns: + int with module output shape. + + """ + if self.flatten_output: + return self.cat_len * self.embedding_size + else: + return self.cat_len + + def forward(self, X: Dict) -> Tensor: + """Produce embedding for each value in input. + + Args: + X : Dict + + Returns: + torch.Tensor + """ + if not self._isfit: + raise RuntimeError("need to call `fit` or `from_summary` first") + X = X["cat"] + list_weights: List[List[List[float]]] = [] + idxs_primary: List[List[int]] = [] + idxs_default: List[List[int]] = [] + for row in X: + list_weights.append([]) + idxs_primary.append([]) + idxs_default.append([]) + for col, val in enumerate(row): + val = val.item() + default = self.lookup_default[col] + idx, count = self.lookup.get((col, val), default) + list_weights[-1].append([count / (count + self.alpha)]) + idxs_primary[-1].append(idx) + idxs_default[-1].append(default[0]) + tsr_weights = torch.tensor(list_weights, dtype=torch.float32, device=self._device) + emb_primary = self.embedding(torch.tensor(idxs_primary, dtype=torch.int64, device=self._device)) + emb_default = self.embedding(torch.tensor(idxs_default, dtype=torch.int64, device=self._device)) + x = tsr_weights * emb_primary + (1 - tsr_weights) * emb_default + if self.flatten_output: + return x.view(x.shape[0], -1) + return x + + +class LinearEmbedding(nn.Module): + """An embedding for numeric fields from https://github.com/jrfiedler/xynn. + + There is one embedded vector for each field. + The embedded vector for a value is that value times its field's vector. + + Args: + embedding_size : int, optional + size of each value's embedding vector; default is 10 + device : string or torch.device + flatten_output: if flatten output or not. 
+ + """ + + def __init__(self, num_dims: int, embedding_size: int = 10, flatten_output: bool = False, **kwargs): + super().__init__() + self.flatten_output = flatten_output + self._isfit = False + self.num_fields = num_dims + self.output_size = 0 + self.embedding: Optional[nn.Embedding] = None + self.embedding_size = embedding_size + self._from_summary(self.num_fields) + + def _from_summary(self, num_fields: int): + self.num_fields = num_fields + self.output_size = num_fields * self.embedding_size + self.embedding = nn.Embedding(num_fields, self.embedding_size) + nn.init.xavier_uniform_(self.embedding.weight) + self._isfit = True + + def get_out_shape(self) -> int: + """Output shape. + + Returns: + int with module output shape. + + """ + if self.flatten_output: + return self.num_fields * self.embedding_size + else: + return self.num_fields + + def forward(self, X: Dict) -> Tensor: + """Produce embedding for each value in input. + + Args: + X : Dict + + Returns: + torch.Tensor + + """ + X = X["cont"] + if not self._isfit: + raise RuntimeError("need to call `fit` or `from_summary` first") + x = self.embedding.weight * X.unsqueeze(dim=-1) + if self.flatten_output: + return x.view(x.shape[0], -1) + return x + + +class DenseEmbedding(nn.Module): + """An embedding for numeric fields, consisting of just a linear transformation with an activation from https://github.com/jrfiedler/xynn. + + Maps an input with shape n_rows * n_fields to an output with shape + n_rows * 1 * embedding_size if one value passed for embedding_size or + n_rows * embeddin_size[0] * embedding_size[1] if two values are passed + + Args: + embedding_size : int, tuple of ints, or list of ints; optional + size of each value's embedding vector; default is 10 + activation : subclass of torch.nn.Module, optional + default is nn.LeakyReLU + device : string or torch.device + flatten_output: if flatten output or not. 
+ """ + + def __init__( + self, + num_dims: int, + embedding_size: Union[int, Tuple[int, ...], List[int]] = 10, + activation: Type[nn.Module] = nn.LeakyReLU, + flatten_output: bool = False, + **kwargs, + ): + super().__init__() + self.flatten_output = flatten_output + if isinstance(embedding_size, int): + embedding_size = (1, embedding_size) + elif len(embedding_size) == 1: + embedding_size = (1, embedding_size[0]) + self._isfit = False + self.num_fields = num_dims + self.output_size = 0 + self.embedding_w = None + self.embedding_b = None + self.dense_out_size = embedding_size + self.embedding_size = embedding_size[-1] + self.activation = activation() + self._from_summary(self.num_fields) + + def _from_summary(self, num_fields: int): + self.output_size = reduce(operator.mul, self.dense_out_size, 1) + self.embedding_w = nn.Parameter(torch.zeros((num_fields, *self.dense_out_size))) + self.embedding_b = nn.Parameter(torch.zeros(self.dense_out_size)) + nn.init.xavier_uniform_(self.embedding_w) + self._isfit = True + + def get_out_shape(self) -> int: + """Output shape. + + Returns: + int with module output shape. + + """ + if self.flatten_output: + return self.output_size + else: + return self.dense_out_size[0] + + def forward(self, X: Dict) -> Tensor: + """Produce embedding for each value in input. 
+ + Args: + X : Dict + + Returns: + torch.Tensor + + """ + X = X["cont"] + if not self._isfit: + raise RuntimeError("need to call `fit` or `from_summary` first") + embedded = self.embedding_w.T.matmul(X.T.to(dtype=torch.float)).T + self.embedding_b + embedded = self.activation(embedded.reshape((X.shape[0], -1))) + x = embedded.reshape((X.shape[0], *self.dense_out_size)) + if self.flatten_output: + return x.view(x.shape[0], -1) + return x + + +class DenseEmbeddingFlat(DenseEmbedding): + """Flatten version of DenseEmbedding.""" + + def __init__(self, *args, **kwargs): + super(DenseEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + + +class LinearEmbeddingFlat(LinearEmbedding): + """Flatten version of LinearEmbedding.""" + + def __init__(self, *args, **kwargs): + super(LinearEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + + +class DefaultEmbeddingFlat(DefaultEmbedding): + """Flatten version of DefaultEmbedding.""" + + def __init__(self, *args, **kwargs): + super(DefaultEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + + +class BasicEmbeddingFlat(BasicEmbedding): + """Flatten version of BasicEmbedding.""" + + def __init__(self, *args, **kwargs): + super(BasicEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) diff --git a/lightautoml/text/nn_model.py b/lightautoml/text/nn_model.py index 8366af82..916cfec6 100644 --- a/lightautoml/text/nn_model.py +++ b/lightautoml/text/nn_model.py @@ -2,28 +2,15 @@ import logging -from typing import Any, List, Tuple, Type +from typing import Any from typing import Callable from typing import Dict from typing import Optional -from typing import Sequence from typing import Union -from functools import reduce import numpy as np import torch import torch.nn as nn -from torch import Tensor -import operator - -try: - from transformers import AutoModel -except: - import warnings - - warnings.warn("'transformers' - package 
isn't installed") - from ..tasks.base import Task -from .dl_transformers import pooling_by_name logger = logging.getLogger(__name__) @@ -100,461 +87,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -class TextBert(nn.Module): - """Text data model. - - Class for working with text data based on HuggingFace transformers. - - Args: - model_name: Transformers model name. - pooling: Pooling type. - - Note: - There are different pooling types: - - - cls: Use CLS token for sentence embedding - from last hidden state. - - max: Maximum on seq_len dimension for non masked - inputs from last hidden state. - - mean: Mean on seq_len dimension for non masked - inputs from last hidden state. - - sum: Sum on seq_len dimension for non masked - inputs from last hidden state. - - none: Without pooling for seq2seq models. - - """ - - _poolers = {"cls", "max", "mean", "sum", "none"} - - def __init__(self, model_name: str = "bert-base-uncased", pooling: str = "cls"): - super(TextBert, self).__init__() - if pooling not in self._poolers: - raise ValueError("pooling - {} - not in the list of available types {}".format(pooling, self._poolers)) - - self.transformer = AutoModel.from_pretrained(model_name) - self.n_out = self.transformer.config.hidden_size - self.dropout = torch.nn.Dropout(0.2) - self.activation = torch.nn.ReLU(inplace=True) - self.pooling = pooling_by_name[pooling]() - - def get_out_shape(self) -> int: - """Output shape. - - Returns: - int with module output shape. 
- - """ - return self.n_out - - def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: - """Forward-pass.""" - # last hidden layer - encoded_layers, _ = self.transformer( - input_ids=inp["input_ids"], - attention_mask=inp["attention_mask"], - token_type_ids=inp.get("token_type_ids"), - return_dict=False, - ) - - # pool the outputs into a vector - encoded_layers = self.pooling(encoded_layers, inp["attention_mask"].unsqueeze(-1).bool()) - mean_last_hidden_state = self.activation(encoded_layers) - mean_last_hidden_state = self.dropout(mean_last_hidden_state) - return mean_last_hidden_state - - -class CatEmbedder(nn.Module): - """Category data model. - - Args: - cat_dims: Sequence with number of unique categories - for category features. - emb_dropout: Dropout probability. - emb_ratio: Ratio for embedding size = (x + 1) // emb_ratio. - max_emb_size: Max embedding size. - - """ - - def __init__( - self, cat_dims: Sequence[int], emb_dropout: bool = 0.1, emb_ratio: int = 3, max_emb_size: int = 50, **kwargs - ): - super(CatEmbedder, self).__init__() - emb_dims = [(int(x), int(min(max_emb_size, max(1, (x + 1) // emb_ratio)))) for x in cat_dims] - self.no_of_embs = sum([y for x, y in emb_dims]) - assert self.no_of_embs != 0, "The input is empty." - # Embedding layers - self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims]) - self.emb_dropout_layer = nn.Dropout(emb_dropout) if emb_dropout else nn.Identity() - - def get_out_shape(self) -> int: - """Output shape. - - Returns: - Int with module output shape. - - """ - return self.no_of_embs - - def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: - """Forward-pass.""" - output = torch.cat( - [emb_layer(inp["cat"][:, i]) for i, emb_layer in enumerate(self.emb_layers)], - dim=1, - ) - output = self.emb_dropout_layer(output) - return output - - -class ContEmbedder(nn.Module): - """Numeric data model. - - Class for working with numeric data. 
- - Args: - num_dims: Sequence with number of numeric features. - input_bn: Use 1d batch norm for input data. - - """ - - def __init__(self, num_dims: int, input_bn: bool = True, **kwargs): - super(ContEmbedder, self).__init__() - self.n_out = num_dims - self.bn = nn.Identity() - if input_bn: - self.bn = nn.BatchNorm1d(num_dims) - assert num_dims != 0, "The input is empty." - - def get_out_shape(self) -> int: - """Output shape. - - Returns: - int with module output shape. - - """ - return self.n_out - - def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: - """Forward-pass.""" - output = inp["cont"] - output = self.bn(output) - return output - - -class BasicEmbedding(nn.Module): - """A basic embedding that creates an embedded vector for each field value from https://github.com/jrfiedler/xynn. - - Args: - embedding_size : int, optional - size of each value's embedding vector; default is 10 - device : string or torch.device - - """ - - def __init__( - self, cat_vc: Sequence[Dict], embedding_size: int = 10, device: Union[str, torch.device] = "cuda:0", **kwargs - ): - super().__init__() - self._device = device - self._isfit = False - self.num_fields = 0 - self.output_size = 0 - self.lookup: Dict[Tuple[int, Any], int] = {} - self.lookup_nan: Dict[int, int] = {} - self.num_values = 0 - self.embedding: Optional[nn.Embedding] = None - self.embedding_size = embedding_size - self._from_summary(cat_vc) - self.cat_len = len(cat_vc) - - def _from_summary(self, uniques: List[Union[List, Tensor, np.ndarray]]): - lookup = {} - lookup_nan = {} - num_values = 0 - for fieldnum, field in enumerate(uniques): - for value in field: - if (fieldnum, value) in lookup: - # extra defense against repeated values - continue - lookup[(fieldnum, value)] = num_values - num_values += 1 - - self.num_fields = len(uniques) - self.output_size = self.num_fields * self.embedding_size - self.lookup = lookup - self.lookup_nan = lookup_nan - self.num_values = num_values - self.embedding = 
nn.Embedding(num_values, self.embedding_size) - nn.init.xavier_uniform_(self.embedding.weight) - self._isfit = True - - def get_out_shape(self) -> int: - """Output shape. - - Returns: - int with module output shape. - - """ - return self.cat_len - - def forward(self, X: Dict) -> Tensor: - """Produce embedding for each value in input. - - Args: - X : Dict - - Returns: - torch.Tensor - - """ - if not self._isfit: - raise RuntimeError("need to call `fit` or `from_summary` first") - X = X["cat"] - idxs: List[List[int]] = [] - for row in X: - idxs.append([]) - for col, val in enumerate(row): - val = val.item() - idx = self.lookup[(col, val)] - idxs[-1].append(idx) - - return self.embedding(torch.tensor(idxs, dtype=torch.int64, device=self._device)) - - -class DefaultEmbedding(nn.Module): - """DefaultEmbedding from https://github.com/jrfiedler/xynn. - - An embedding with a default value for each field. The default is returned for - any field value not seen when the embedding was initialized (using `fit` or - `from_summary`). For any value seen at initialization, a weighted average of - that value's embedding and the default embedding is returned. 
The weights for - the average are determined by the parameter `alpha`: - - weight = count / (count + alpha) - final = embedding * weight + default * (1 - weight) - - Args: - embedding_size : int, optional - size of each value's embedding vector; default is 10 - alpha : int, optional - controls the weighting of each embedding vector with the default; - when `alpha`-many values are seen at initialization; the final - vector is evenly weighted; the influence of the default is decreased - with either higher counts or lower `alpha`; default is 20 - device : string or torch.device - - """ - - def __init__( - self, - cat_vc: Sequence[Dict], - embedding_size: int = 10, - alpha: int = 20, - device: Union[str, torch.device] = "cuda:0", - **kwargs, - ): - super().__init__() - self._isfit = False - self._device = device - self.num_fields = 0 - self.output_size = 0 - self.alpha = alpha - self.lookup: Dict[Tuple[int, Any], Tuple[int, int]] = {} - self.lookup_default: Dict[int, Tuple[int, int]] = {} - self.num_values = 0 - self.embedding: Optional[nn.Embedding] = None - self.embedding_size = embedding_size - self._from_summary(cat_vc) - self.cat_len = len(cat_vc) - - def _from_summary(self, unique_counts: List[Dict[Any, int]]): - lookup = {} - lookup_default = {} - num_values = 0 - for fieldnum, counts in enumerate(unique_counts): - lookup_default[fieldnum] = (num_values, 0) - num_values += 1 - for value, count in counts.items(): - lookup[(fieldnum, value)] = (num_values, count) - num_values += 1 - - self.num_fields = len(unique_counts) - self.output_size = self.num_fields * self.embedding_size - self.lookup = lookup - self.lookup_default = lookup_default - self.num_values = num_values - self.embedding = nn.Embedding(num_values, self.embedding_size) - nn.init.xavier_uniform_(self.embedding.weight) - - self._isfit = True - - def get_out_shape(self) -> int: - """Output shape. - - Returns: - int with module output shape. 
- - """ - return self.cat_len - - def forward(self, X: Dict) -> Tensor: - """Produce embedding for each value in input. - - Args: - X : Dict - - Returns: - torch.Tensor - """ - if not self._isfit: - raise RuntimeError("need to call `fit` or `from_summary` first") - X = X["cat"] - list_weights: List[List[List[float]]] = [] - idxs_primary: List[List[int]] = [] - idxs_default: List[List[int]] = [] - for row in X: - list_weights.append([]) - idxs_primary.append([]) - idxs_default.append([]) - for col, val in enumerate(row): - val = val.item() - default = self.lookup_default[col] - idx, count = self.lookup.get((col, val), default) - list_weights[-1].append([count / (count + self.alpha)]) - idxs_primary[-1].append(idx) - idxs_default[-1].append(default[0]) - tsr_weights = torch.tensor(list_weights, dtype=torch.float32, device=self._device) - emb_primary = self.embedding(torch.tensor(idxs_primary, dtype=torch.int64, device=self._device)) - emb_default = self.embedding(torch.tensor(idxs_default, dtype=torch.int64, device=self._device)) - x = tsr_weights * emb_primary + (1 - tsr_weights) * emb_default - return x - - -class LinearEmbedding(nn.Module): - """An embedding for numeric fields from https://github.com/jrfiedler/xynn. - - There is one embedded vector for each field. - The embedded vector for a value is that value times its field's vector. 
- - Args: - embedding_size : int, optional - size of each value's embedding vector; default is 10 - device : string or torch.device - - """ - - def __init__(self, num_dims: int, embedding_size: int = 10, **kwargs): - super().__init__() - self._isfit = False - self.num_fields = num_dims - self.output_size = 0 - self.embedding: Optional[nn.Embedding] = None - self.embedding_size = embedding_size - self._from_summary(self.num_fields) - - def _from_summary(self, num_fields: int): - self.num_fields = num_fields - self.output_size = num_fields * self.embedding_size - self.embedding = nn.Embedding(num_fields, self.embedding_size) - nn.init.xavier_uniform_(self.embedding.weight) - self._isfit = True - - def get_out_shape(self) -> int: - """Output shape. - - Returns: - int with module output shape. - - """ - return self.num_fields - - def forward(self, X: Dict) -> Tensor: - """Produce embedding for each value in input. - - Args: - X : Dict - - Returns: - torch.Tensor - - """ - X = X["cont"] - if not self._isfit: - raise RuntimeError("need to call `fit` or `from_summary` first") - return self.embedding.weight * X.unsqueeze(dim=-1) - - -class DenseEmbedding(nn.Module): - """An embedding for numeric fields, consisting of just a linear transformation with an activation from https://github.com/jrfiedler/xynn. 
- - Maps an input with shape n_rows * n_fields to an output with shape - n_rows * 1 * embedding_size if one value passed for embedding_size or - n_rows * embeddin_size[0] * embedding_size[1] if two values are passed - - Args: - embedding_size : int, tuple of ints, or list of ints; optional - size of each value's embedding vector; default is 10 - activation : subclass of torch.nn.Module, optional - default is nn.LeakyReLU - device : string or torch.device - """ - - def __init__( - self, - num_dims: int, - embedding_size: Union[int, Tuple[int, ...], List[int]] = 10, - activation: Type[nn.Module] = nn.LeakyReLU, - **kwargs, - ): - super().__init__() - - if isinstance(embedding_size, int): - embedding_size = (1, embedding_size) - elif len(embedding_size) == 1: - embedding_size = (1, embedding_size[0]) - self._isfit = False - self.num_fields = num_dims - self.output_size = 0 - self.embedding_w = None - self.embedding_b = None - self.dense_out_size = embedding_size - self.embedding_size = embedding_size[-1] - self.activation = activation() - self._from_summary(self.num_fields) - - def _from_summary(self, num_fields: int): - self.output_size = reduce(operator.mul, self.dense_out_size, 1) - self.embedding_w = nn.Parameter(torch.zeros((num_fields, *self.dense_out_size))) - self.embedding_b = nn.Parameter(torch.zeros(self.dense_out_size)) - nn.init.xavier_uniform_(self.embedding_w) - self._isfit = True - - def get_out_shape(self) -> int: - """Output shape. - - Returns: - int with module output shape. - - """ - return self.dense_out_size[0] - - def forward(self, X: Dict) -> Tensor: - """Produce embedding for each value in input. 
- - Args: - X : Dict - - Returns: - torch.Tensor - - """ - X = X["cont"] - if not self._isfit: - raise RuntimeError("need to call `fit` or `from_summary` first") - embedded = self.embedding_w.T.matmul(X.T.to(dtype=torch.float)).T + self.embedding_b - embedded = self.activation(embedded.reshape((X.shape[0], -1))) - return embedded.reshape((X.shape[0], *self.dense_out_size)) - - class TorchUniversalModel(nn.Module): """Mixed data model. From 294383fef2ea6b8e79ced69c951deb00486d42d2 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 08:41:04 +0000 Subject: [PATCH 11/49] not done still --- lightautoml/automl/presets/tabular_presets.py | 1 + lightautoml/ml_algo/dl_model.py | 7 +- lightautoml/ml_algo/tabnet/utils.py | 388 ++++++++ .../torch_based/autoint/autoint_utils.py | 46 - lightautoml/ml_algo/torch_based/nn_models.py | 118 ++- .../ml_algo/torch_based/node_nn_model.py | 33 +- .../pytorch_tabnet/abstract_model.py | 826 ++++++++++++++++ .../pytorch_tabnet/augmentations.py | 85 ++ .../torch_based/pytorch_tabnet/callbacks.py | 287 ++++++ .../torch_based/pytorch_tabnet/metrics.py | 523 ++++++++++ .../pytorch_tabnet/multiclass_utils.py | 425 ++++++++ .../torch_based/pytorch_tabnet/multitask.py | 178 ++++ .../torch_based/pytorch_tabnet/pretraining.py | 428 ++++++++ .../pytorch_tabnet/pretraining_utils.py | 128 +++ .../torch_based/pytorch_tabnet/sparsemax.py | 278 ++++++ .../torch_based/pytorch_tabnet/tab_model.py | 154 +++ .../torch_based/pytorch_tabnet/tab_network.py | 934 ++++++++++++++++++ .../torch_based/pytorch_tabnet/utils.py | 552 +++++++++++ 18 files changed, 5332 insertions(+), 59 deletions(-) create mode 100644 lightautoml/ml_algo/tabnet/utils.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py create mode 100644 
lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py create mode 100755 lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py create mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py diff --git a/lightautoml/automl/presets/tabular_presets.py b/lightautoml/automl/presets/tabular_presets.py index cf4ba8fe..539b2df4 100755 --- a/lightautoml/automl/presets/tabular_presets.py +++ b/lightautoml/automl/presets/tabular_presets.py @@ -609,6 +609,7 @@ def create_automl(self, **fit_args): "node", "autoint", "autoint_emb_v2", + "tabnet" ] available_nn_models = available_nn_models + [x + "_tuned" for x in available_nn_models] nn_models = [ diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 49be49f0..fc4b38ef 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -44,7 +44,7 @@ from ..ml_algo.base import TabularDataset from ..ml_algo.base import TabularMLAlgo from ..pipelines.utils import get_columns_by_role -from ..text.embed import CatEmbedder, DefaultEmbedding, DenseEmbedding, LinearEmbedding, BasicEmbedding +from ..text.embed import BasicEmbeddingFlat, CatEmbedder, DefaultEmbedding, DenseEmbedding, LinearEmbedding, BasicEmbedding, LinearEmbeddingFlat from ..text.embed import ContEmbedder from ..text.embed import TextBert from ..text.nn_model import TorchUniversalModel @@ -56,7 +56,7 @@ from ..text.utils import is_shuffle from ..text.utils import parse_devices from ..text.utils import seed_everything 
-from .torch_based.nn_models import MLP +from .torch_based.nn_models import MLP, TabNet from .torch_based.nn_models import NODE from .torch_based.nn_models import SNN from .torch_based.nn_models import DenseLightModel @@ -80,6 +80,7 @@ "node": NODE, "autoint": AutoInt, "autoint_emb_v2": AutoInt, + "tabnet":TabNet, } cat_embedder_by_name = { "denselight": CatEmbedder, @@ -92,6 +93,7 @@ "node": CatEmbedder, "autoint": BasicEmbedding, "autoint_emb_v2": DefaultEmbedding, + "tabnet":BasicEmbeddingFlat, } cont_embedder_params_by_name = { "denselight": ContEmbedder, @@ -104,6 +106,7 @@ "node": ContEmbedder, "autoint": LinearEmbedding, "autoint_emb_v2": DenseEmbedding, + "tabnet":LinearEmbeddingFlat, } diff --git a/lightautoml/ml_algo/tabnet/utils.py b/lightautoml/ml_algo/tabnet/utils.py new file mode 100644 index 00000000..9dad6259 --- /dev/null +++ b/lightautoml/ml_algo/tabnet/utils.py @@ -0,0 +1,388 @@ +"""Utils for pytorch-tabnet model.""" +import torch +import numpy as np +import torch.nn as nn +from lightautoml.ml_algo.torch_based.node_nn_model import Entmax15, Sparsemax, sparsemax,entmax15 +from lightautoml.ml_algo.torch_based.autoint.ghost_norm import GhostBatchNorm + + +def initialize_non_glu(module, input_dim, output_dim): + gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(4 * input_dim)) + torch.nn.init.xavier_normal_(module.weight, gain=gain_value) + # torch.nn.init.zeros_(module.bias) + return + + +def initialize_glu(module, input_dim, output_dim): + gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(input_dim)) + torch.nn.init.xavier_normal_(module.weight, gain=gain_value) + # torch.nn.init.zeros_(module.bias) + return + + + + + +class TabNetEncoder(torch.nn.Module): + def __init__( + self, + input_dim, + output_dim, + n_d=8, + n_a=8, + n_steps=3, + gamma=1.3, + n_independent=2, + n_shared=2, + epsilon=1e-15, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + group_attention_matrix=None, + ): + """ + Defines main part of the 
TabNet network without the embedding layers. + + Parameters + ---------- + input_dim : int + Number of features + output_dim : int or list of int for multi task classification + Dimension of network output + examples : one for regression, 2 for binary classification etc... + n_d : int + Dimension of the prediction layer (usually between 4 and 64) + n_a : int + Dimension of the attention layer (usually between 4 and 64) + n_steps : int + Number of successive steps in the network (usually between 3 and 10) + gamma : float + Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) + n_independent : int + Number of independent GLU layer in each GLU block (default 2) + n_shared : int + Number of independent GLU layer in each GLU block (default 2) + epsilon : float + Avoid log(0), this should be kept very low + virtual_batch_size : int + Batch size for Ghost Batch Normalization + momentum : float + Float value between 0 and 1 which will be used for momentum in all batch norm + mask_type : str + Either "sparsemax" or "entmax" : this is the masking function to use + group_attention_matrix : torch matrix + Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j + """ + super(TabNetEncoder, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.is_multi_task = isinstance(output_dim, list) + self.n_d = n_d + self.n_a = n_a + self.n_steps = n_steps + self.gamma = gamma + self.epsilon = epsilon + self.n_independent = n_independent + self.n_shared = n_shared + self.virtual_batch_size = virtual_batch_size + self.mask_type = mask_type + self.initial_bn = nn.BatchNorm1d(self.input_dim, momentum=0.01) + self.group_attention_matrix = group_attention_matrix + + if self.group_attention_matrix is None: + # no groups + self.group_attention_matrix = torch.eye(self.input_dim) + self.attention_dim = self.input_dim + else: + self.attention_dim = self.group_attention_matrix.shape[0] + + if self.n_shared > 0: + 
shared_feat_transform = torch.nn.ModuleList() + for i in range(self.n_shared): + if i == 0: + shared_feat_transform.append( + nn.Linear(self.input_dim, 2 * (n_d + n_a), bias=False) + ) + else: + shared_feat_transform.append( + nn.Linear(n_d + n_a, 2 * (n_d + n_a), bias=False) + ) + + else: + shared_feat_transform = None + + self.initial_splitter = FeatTransformer( + self.input_dim, + n_d + n_a, + shared_feat_transform, + n_glu_independent=self.n_independent, + virtual_batch_size=self.virtual_batch_size, + momentum=momentum, + ) + + self.feat_transformers = torch.nn.ModuleList() + self.att_transformers = torch.nn.ModuleList() + + for step in range(n_steps): + transformer = FeatTransformer( + self.input_dim, + n_d + n_a, + shared_feat_transform, + n_glu_independent=self.n_independent, + virtual_batch_size=self.virtual_batch_size, + momentum=momentum, + ) + attention = AttentiveTransformer( + n_a, + self.attention_dim, + group_matrix=group_attention_matrix, + virtual_batch_size=self.virtual_batch_size, + momentum=momentum, + mask_type=self.mask_type, + ) + self.feat_transformers.append(transformer) + self.att_transformers.append(attention) + + def forward(self, x, prior=None): + x = self.initial_bn(x) + + bs = x.shape[0] # batch size + if prior is None: + prior = torch.ones((bs, self.attention_dim)).to(x.device) + + M_loss = 0 + att = self.initial_splitter(x)[:, self.n_d :] + steps_output = [] + for step in range(self.n_steps): + M = self.att_transformers[step](prior, att) + M_loss += torch.mean( + torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1) + ) + # update prior + prior = torch.mul(self.gamma - M, prior) + # output + M_feature_level = torch.matmul(M, self.group_attention_matrix.to(x.device)) + masked_x = torch.mul(M_feature_level, x) + out = self.feat_transformers[step](masked_x) + d = nn.ReLU()(out[:, : self.n_d]) + steps_output.append(d) + # update attention + att = out[:, self.n_d :] + + M_loss /= self.n_steps + return steps_output, M_loss + + def 
forward_masks(self, x): + x = self.initial_bn(x) + bs = x.shape[0] # batch size + prior = torch.ones((bs, self.attention_dim)).to(x.device) + M_explain = torch.zeros(x.shape).to(x.device) + att = self.initial_splitter(x)[:, self.n_d :] + masks = {} + + for step in range(self.n_steps): + M = self.att_transformers[step](prior, att) + M_feature_level = torch.matmul(M, self.group_attention_matrix.to(x.device)) + masks[step] = M_feature_level + # update prior + prior = torch.mul(self.gamma - M, prior) + # output + masked_x = torch.mul(M_feature_level, x) + out = self.feat_transformers[step](masked_x) + d = nn.ReLU()(out[:, : self.n_d]) + # explain + step_importance = torch.sum(d, dim=1) + M_explain += torch.mul(M_feature_level, step_importance.unsqueeze(dim=1)) + # update attention + att = out[:, self.n_d :] + + return M_explain, masks + + + +class FeatTransformer(torch.nn.Module): + def __init__( + self, + input_dim, + output_dim, + shared_layers, + n_glu_independent, + virtual_batch_size=128, + momentum=0.02, + ): + super(FeatTransformer, self).__init__() + """ + Initialize a feature transformer. 
+ + Parameters + ---------- + input_dim : int + Input size + output_dim : int + Output_size + shared_layers : torch.nn.ModuleList + The shared block that should be common to every step + n_glu_independent : int + Number of independent GLU layers + virtual_batch_size : int + Batch size for Ghost Batch Normalization within GLU block(s) + momentum : float + Float value between 0 and 1 which will be used for momentum in batch norm + """ + + params = { + "n_glu": n_glu_independent, + "virtual_batch_size": virtual_batch_size, + "momentum": momentum, + } + + if shared_layers is None: + # no shared layers + self.shared = torch.nn.Identity() + is_first = True + else: + self.shared = GLU_Block( + input_dim, + output_dim, + first=True, + shared_layers=shared_layers, + n_glu=len(shared_layers), + virtual_batch_size=virtual_batch_size, + momentum=momentum, + ) + is_first = False + + if n_glu_independent == 0: + # no independent layers + self.specifics = torch.nn.Identity() + else: + spec_input_dim = input_dim if is_first else output_dim + self.specifics = GLU_Block( + spec_input_dim, output_dim, first=is_first, **params + ) + + def forward(self, x): + x = self.shared(x) + x = self.specifics(x) + return x + + +class GLU_Block(torch.nn.Module): + """ + Independent GLU block, specific to each step + """ + + def __init__( + self, + input_dim, + output_dim, + n_glu=2, + first=False, + shared_layers=None, + virtual_batch_size=128, + momentum=0.02, + ): + super(GLU_Block, self).__init__() + self.first = first + self.shared_layers = shared_layers + self.n_glu = n_glu + self.glu_layers = torch.nn.ModuleList() + + params = {"virtual_batch_size": virtual_batch_size, "momentum": momentum} + + fc = shared_layers[0] if shared_layers else None + self.glu_layers.append(GLU_Layer(input_dim, output_dim, fc=fc, **params)) + for glu_id in range(1, self.n_glu): + fc = shared_layers[glu_id] if shared_layers else None + self.glu_layers.append(GLU_Layer(output_dim, output_dim, fc=fc, **params)) + + 
def forward(self, x): + scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device)) + if self.first: # the first layer of the block has no scale multiplication + x = self.glu_layers[0](x) + layers_left = range(1, self.n_glu) + else: + layers_left = range(self.n_glu) + + for glu_id in layers_left: + x = torch.add(x, self.glu_layers[glu_id](x)) + x = x * scale + return x + + + +class GLU_Layer(torch.nn.Module): + def __init__( + self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02 + ): + super(GLU_Layer, self).__init__() + + self.output_dim = output_dim + if fc: + self.fc = fc + else: + self.fc = nn.Linear(input_dim, 2 * output_dim, bias=False) + initialize_glu(self.fc, input_dim, 2 * output_dim) + + self.bn = GhostBatchNorm( + 2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum + ) + + def forward(self, x): + x = self.fc(x) + x = self.bn(x) + out = torch.mul(x[:, : self.output_dim], torch.sigmoid(x[:, self.output_dim :])) + return out + + + +class AttentiveTransformer(torch.nn.Module): + def __init__( + self, + input_dim, + group_dim, + group_matrix, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + ): + """ + Initialize an attention transformer. 
+ + Parameters + ---------- + input_dim : int + Input size + group_dim : int + Number of groups for features + virtual_batch_size : int + Batch size for Ghost Batch Normalization + momentum : float + Float value between 0 and 1 which will be used for momentum in batch norm + mask_type : str + Either "sparsemax" or "entmax" : this is the masking function to use + """ + super(AttentiveTransformer, self).__init__() + self.fc = nn.Linear(input_dim, group_dim, bias=False) + initialize_non_glu(self.fc, input_dim, group_dim) + self.bn = GhostBatchNorm( + group_dim, virtual_batch_size=virtual_batch_size, momentum=momentum + ) + + if mask_type == "sparsemax": + # Sparsemax + self.selector = Sparsemax() + elif mask_type == "entmax": + # Entmax + self.selector = Entmax15() + else: + raise NotImplementedError( + "Please choose either sparsemax" + "or entmax as masktype" + ) + + def forward(self, priors, processed_feat): + x = self.fc(processed_feat) + x = self.bn(x) + x = torch.mul(x, priors) + x = self.selector(x) + return x \ No newline at end of file diff --git a/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py b/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py index c14944f5..c96b3241 100644 --- a/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py +++ b/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py @@ -12,52 +12,6 @@ EmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "output_size"]) UniformEmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "embedding_size", "output_size"]) -MODULE_INIT_DOC = """ -Parameters ----------- -output_size : int - number of final output values; i.e., number of targets for - regression or number of classes for classification -embedding_num : EmbeddingBase or None - initialized and fit embedding for numeric fields -embedding_cat : EmbeddingBase or None - initialized and fit embedding for categorical fields -embedding_l1_reg : float, optional - value for l1 regularization of embedding vectors; default 
is 0.0 -embedding_l2_reg : float, optional - value for l2 regularization of embedding vectors; default is 0.0 -{} -mlp_hidden_sizes : int or iterable of int, optional - sizes for the linear transformations between the MLP input and - the output size needed based on the target; default is (512, 256, 128, 64) -mlp_activation : subclass of torch.nn.Module (uninitialized), optional - default is nn.LeakyReLU -mlp_use_bn : boolean, optional - whether to use batch normalization between MLP linear layers; - default is True -mlp_bn_momentum : float, optional - only used if `mlp_use_bn` is True; default is 0.01 -mlp_ghost_batch : int or None, optional - only used if `mlp_use_bn` is True; size of batch in "ghost batch norm"; - if None, normal batch norm is used; defualt is None -mlp_dropout : float, optional - whether and how much dropout to use between MLP linear layers; - `0.0 <= mlp_dropout < 1.0`; default is 0.0 -mlp_use_skip : boolean, optional - use a side path in the MLP containing just the optional leaky gate - plus single linear layer; default is True -mlp_l1_reg : float, optional - value for l1 regularization of MLP weights; default is 0.0 -mlp_l2_reg : float, optional - value for l2 regularization of MLP weights; default is 0.0 -use_leaky_gate : boolean, optional - whether to include "leaky gate" layers; default is True -loss_fn : "auto" or PyTorch loss function, optional - default is "auto" -device : string or torch.device, optional - default is "cpu" - -""" class LeakyGate(nn.Module): diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 291a2587..22512329 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -8,11 +8,11 @@ import numpy as np import torch import torch.nn as nn +from lightautoml.ml_algo.tabnet.utils import TabNetEncoder, initialize_non_glu from lightautoml.ml_algo.torch_based.autoint.autoint_utils import AttnInteractionBlock, 
LeakyGate from lightautoml.ml_algo.torch_based.autoint.ghost_norm import GhostBatchNorm -from lightautoml.ml_algo.torch_based.node_nn_model import DenseODSTBlock -from lightautoml.ml_algo.torch_based.node_nn_model import Lambda +from lightautoml.ml_algo.torch_based.node_nn_model import DenseODSTBlock, MeanPooling class GaussianNoise(nn.Module): @@ -840,7 +840,7 @@ def __init__( self.features1.add_module("ODSTForestblock%d", block) self.features2 = nn.Sequential(OrderedDict([])) if use_original_head: - last_layer = Lambda(lambda x: x[..., :n_out].mean(dim=-2)) + last_layer = MeanPooling(n_out,dim=-2) self.features2.add_module("head", last_layer) else: if use_bn: @@ -976,3 +976,115 @@ def forward(self, embedded: torch.Tensor) -> torch.Tensor: mix = torch.sigmoid(self.mix) out = mix * out + (1 - mix) * self.mlp(embedded_2d) return out + + + + +class TabNet(torch.nn.Module): + def __init__( + self, + n_in, + n_out, + n_d=8, + n_a=8, + n_steps=3, + gamma=1.3, + n_independent=2, + n_shared=2, + epsilon=1e-15, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + group_attention_matrix=None, + **kwargs, + ): + """ + Defines main part of the TabNet network without the embedding layers. + + Parameters + ---------- + input_dim : int + Number of features + output_dim : int or list of int for multi task classification + Dimension of network output + examples : one for regression, 2 for binary classification etc... 
+ n_d : int + Dimension of the prediction layer (usually between 4 and 64) + n_a : int + Dimension of the attention layer (usually between 4 and 64) + n_steps : int + Number of successive steps in the network (usually between 3 and 10) + gamma : float + Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) + n_independent : int + Number of independent GLU layer in each GLU block (default 2) + n_shared : int + Number of independent GLU layer in each GLU block (default 2) + epsilon : float + Avoid log(0), this should be kept very low + virtual_batch_size : int + Batch size for Ghost Batch Normalization + momentum : float + Float value between 0 and 1 which will be used for momentum in all batch norm + mask_type : str + Either "sparsemax" or "entmax" : this is the masking function to use + group_attention_matrix : torch matrix + Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j + """ + super(TabNet, self).__init__() + self.input_dim = n_in + self.output_dim = n_out + self.is_multi_task = isinstance(n_out, list) + self.n_d = n_d + self.n_a = n_a + self.n_steps = n_steps + self.gamma = gamma + self.epsilon = epsilon + self.n_independent = n_independent + self.n_shared = n_shared + self.virtual_batch_size = virtual_batch_size + self.mask_type = mask_type + self.initial_bn = nn.BatchNorm1d(self.input_dim, momentum=0.01) + + self.encoder = TabNetEncoder( + input_dim=n_in, + output_dim=n_out, + n_d=n_d, + n_a=n_a, + n_steps=n_steps, + gamma=gamma, + n_independent=n_independent, + n_shared=n_shared, + epsilon=epsilon, + virtual_batch_size=virtual_batch_size, + momentum=momentum, + mask_type=mask_type, + group_attention_matrix=group_attention_matrix + ) + + if self.is_multi_task: + self.multi_task_mappings = torch.nn.ModuleList() + for task_dim in n_out: + task_mapping = nn.Linear(n_d, task_dim, bias=False) + initialize_non_glu(task_mapping, n_d, task_dim) + self.multi_task_mappings.append(task_mapping) + else: + 
self.final_mapping = nn.Linear(n_d, n_out, bias=False) + initialize_non_glu(self.final_mapping, n_d, n_out) + + def forward(self, x): + res = 0 + steps_output, M_loss = self.encoder(x) + res = torch.sum(torch.stack(steps_output, dim=0), dim=0) + + if self.is_multi_task: + # Result will be in list format + out = [] + for task_mapping in self.multi_task_mappings: + out.append(task_mapping(res)) + else: + out = self.final_mapping(res) + return out + + def forward_masks(self, x): + return self.encoder.forward_masks(x) \ No newline at end of file diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py index cdfedbea..e414c6db 100644 --- a/lightautoml/ml_algo/torch_based/node_nn_model.py +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -124,7 +124,14 @@ def _threshold_and_support(input, dim=-1): sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) # noqa: E731 sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) # noqa: E731 +class Sparsemax(nn.Module): + def __init__(self, dim=-1): + self.dim = dim + super(Sparsemax, self).__init__() + + def forward(self, input): + return SparsemaxFunction.apply(input, self.dim) class Entmax15Function(Function): """An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). @@ -256,28 +263,38 @@ def _backward(output, grad_output): entmax15 = lambda input, dim=-1: Entmax15Function.apply(input, dim) # noqa: E731 entmoid15 = Entmoid15.apply # noqa: E731 +class Entmax15(nn.Module): + def __init__(self, dim=-1): + self.dim = dim + super(Entmax15, self).__init__() + + def forward(self, input): + return Entmax15Function.apply(input, self.dim) -class Lambda(nn.Module): - """Pytorch implementation of lambda. +class MeanPooling(nn.Module): + """Pytorch implementation of MeanPooling head. Args: - func : returned func + n_out: int, output dim. + dim: int: the dimension to be averaged. 
+ """ - def __init__(self, func): + def __init__(self, n_out, dim=-1): super().__init__() - self.func = func + self.n_out = n_out + self.dim = dim - def forward(self, *args, **kwargs): + def forward(self, x: torch.Tensor): """Forward-pass. # noqa: DAR101 Returns: - f(*args, **kwargs) + x[..., :self.n_out].mean(dim=self.dim) """ - return self.func(*args, **kwargs) + return x[..., :self.n_out].mean(dim=self.dim) class ModuleWithInit(nn.Module): diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py new file mode 100644 index 00000000..a1734439 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py @@ -0,0 +1,826 @@ +from dataclasses import dataclass, field +from typing import List, Any, Dict +import torch +from torch.nn.utils import clip_grad_norm_ +import numpy as np +from scipy.sparse import csc_matrix +from abc import abstractmethod +from pytorch_tabnet import tab_network +from pytorch_tabnet.utils import ( + SparsePredictDataset, + PredictDataset, + create_explain_matrix, + validate_eval_set, + create_dataloaders, + define_device, + ComplexEncoder, + check_input, + check_warm_start, + create_group_matrix, + check_embedding_parameters +) +from pytorch_tabnet.callbacks import ( + CallbackContainer, + History, + EarlyStopping, + LRSchedulerCallback, +) +from pytorch_tabnet.metrics import MetricContainer, check_metrics +from sklearn.base import BaseEstimator + +from torch.utils.data import DataLoader +import io +import json +from pathlib import Path +import shutil +import zipfile +import warnings +import copy +import scipy + + +@dataclass +class TabModel(BaseEstimator): + """ Class for TabNet model.""" + + n_d: int = 8 + n_a: int = 8 + n_steps: int = 3 + gamma: float = 1.3 + cat_idxs: List[int] = field(default_factory=list) + cat_dims: List[int] = field(default_factory=list) + cat_emb_dim: int = 1 + n_independent: int = 2 + n_shared: int = 2 + 
epsilon: float = 1e-15 + momentum: float = 0.02 + lambda_sparse: float = 1e-3 + seed: int = 0 + clip_value: int = 1 + verbose: int = 1 + optimizer_fn: Any = torch.optim.Adam + optimizer_params: Dict = field(default_factory=lambda: dict(lr=2e-2)) + scheduler_fn: Any = None + scheduler_params: Dict = field(default_factory=dict) + mask_type: str = "sparsemax" + input_dim: int = None + output_dim: int = None + device_name: str = "auto" + n_shared_decoder: int = 1 + n_indep_decoder: int = 1 + grouped_features: List[List[int]] = field(default_factory=list) + + def __post_init__(self): + # These are default values needed for saving model + self.batch_size = 1024 + self.virtual_batch_size = 128 + + torch.manual_seed(self.seed) + # Defining device + self.device = torch.device(define_device(self.device_name)) + if self.verbose != 0: + warnings.warn(f"Device used : {self.device}") + + # create deep copies of mutable parameters + self.optimizer_fn = copy.deepcopy(self.optimizer_fn) + self.scheduler_fn = copy.deepcopy(self.scheduler_fn) + + updated_params = check_embedding_parameters(self.cat_dims, + self.cat_idxs, + self.cat_emb_dim) + self.cat_dims, self.cat_idxs, self.cat_emb_dim = updated_params + + def __update__(self, **kwargs): + """ + Updates parameters. + If does not already exists, creates it. + Otherwise overwrite with warnings. 
+ """ + update_list = [ + "cat_dims", + "cat_emb_dim", + "cat_idxs", + "input_dim", + "mask_type", + "n_a", + "n_d", + "n_independent", + "n_shared", + "n_steps", + "grouped_features", + ] + for var_name, value in kwargs.items(): + if var_name in update_list: + try: + exec(f"global previous_val; previous_val = self.{var_name}") + if previous_val != value: # noqa + wrn_msg = f"Pretraining: {var_name} changed from {previous_val} to {value}" # noqa + warnings.warn(wrn_msg) + exec(f"self.{var_name} = value") + except AttributeError: + exec(f"self.{var_name} = value") + + def fit( + self, + X_train, + y_train, + eval_set=None, + eval_name=None, + eval_metric=None, + loss_fn=None, + weights=0, + max_epochs=100, + patience=10, + batch_size=1024, + virtual_batch_size=128, + num_workers=0, + drop_last=True, + callbacks=None, + pin_memory=True, + from_unsupervised=None, + warm_start=False, + augmentations=None, + compute_importance=True + ): + """Train a neural network stored in self.network + Using train_dataloader for training data and + valid_dataloader for validation. + + Parameters + ---------- + X_train : np.ndarray + Train set + y_train : np.array + Train targets + eval_set : list of tuple + List of eval tuple set (X, y). + The last one is used for early stopping + eval_name : list of str + List of eval set names. + eval_metric : list of str + List of evaluation metrics. + The last metric is used for early stopping. 
+ loss_fn : callable or None + a PyTorch loss function + weights : bool or dictionnary + 0 for no balancing + 1 for automated balancing + dict for custom weights per class + max_epochs : int + Maximum number of epochs during training + patience : int + Number of consecutive non improving epoch before early stopping + batch_size : int + Training batch size + virtual_batch_size : int + Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size) + num_workers : int + Number of workers used in torch.utils.data.DataLoader + drop_last : bool + Whether to drop last batch during training + callbacks : list of callback function + List of custom callbacks + pin_memory: bool + Whether to set pin_memory to True or False during training + from_unsupervised: unsupervised trained model + Use a previously self supervised model as starting weights + warm_start: bool + If True, current model parameters are used to start training + compute_importance : bool + Whether to compute feature importance + """ + # update model name + + self.max_epochs = max_epochs + self.patience = patience + self.batch_size = batch_size + self.virtual_batch_size = virtual_batch_size + self.num_workers = num_workers + self.drop_last = drop_last + self.input_dim = X_train.shape[1] + self._stop_training = False + self.pin_memory = pin_memory and (self.device.type != "cpu") + self.augmentations = augmentations + self.compute_importance = compute_importance + + if self.augmentations is not None: + # This ensure reproducibility + self.augmentations._set_seed() + + eval_set = eval_set if eval_set else [] + + if loss_fn is None: + self.loss_fn = self._default_loss + else: + self.loss_fn = loss_fn + + check_input(X_train) + check_warm_start(warm_start, from_unsupervised) + + self.update_fit_params( + X_train, + y_train, + eval_set, + weights, + ) + + # Validate and reformat eval set depending on training data + eval_names, eval_set = validate_eval_set(eval_set, eval_name, X_train, y_train) + + 
train_dataloader, valid_dataloaders = self._construct_loaders( + X_train, y_train, eval_set + ) + + if from_unsupervised is not None: + # Update parameters to match self pretraining + self.__update__(**from_unsupervised.get_params()) + + if not hasattr(self, "network") or not warm_start: + # model has never been fitted before of warm_start is False + self._set_network() + self._update_network_params() + self._set_metrics(eval_metric, eval_names) + self._set_optimizer() + self._set_callbacks(callbacks) + + if from_unsupervised is not None: + self.load_weights_from_unsupervised(from_unsupervised) + warnings.warn("Loading weights from unsupervised pretraining") + # Call method on_train_begin for all callbacks + self._callback_container.on_train_begin() + + # Training loop over epochs + for epoch_idx in range(self.max_epochs): + + # Call method on_epoch_begin for all callbacks + self._callback_container.on_epoch_begin(epoch_idx) + + self._train_epoch(train_dataloader) + + # Apply predict epoch to all eval sets + for eval_name, valid_dataloader in zip(eval_names, valid_dataloaders): + self._predict_epoch(eval_name, valid_dataloader) + + # Call method on_epoch_end for all callbacks + self._callback_container.on_epoch_end( + epoch_idx, logs=self.history.epoch_metrics + ) + + if self._stop_training: + break + + # Call method on_train_end for all callbacks + self._callback_container.on_train_end() + self.network.eval() + + if self.compute_importance: + # compute feature importance once the best model is defined + self.feature_importances_ = self._compute_feature_importances(X_train) + + def predict(self, X): + """ + Make predictions on a batch (valid) + + Parameters + ---------- + X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` + Input data + + Returns + ------- + predictions : np.array + Predictions of the regression problem + """ + self.network.eval() + + if scipy.sparse.issparse(X): + dataloader = DataLoader( + SparsePredictDataset(X), + 
batch_size=self.batch_size, + shuffle=False, + ) + else: + dataloader = DataLoader( + PredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + + results = [] + for batch_nb, data in enumerate(dataloader): + data = data.to(self.device).float() + output, M_loss = self.network(data) + predictions = output.cpu().detach().numpy() + results.append(predictions) + res = np.vstack(results) + return self.predict_func(res) + + def explain(self, X, normalize=False): + """ + Return local explanation + + Parameters + ---------- + X : tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` + Input data + normalize : bool (default False) + Wheter to normalize so that sum of features are equal to 1 + + Returns + ------- + M_explain : matrix + Importance per sample, per columns. + masks : matrix + Sparse matrix showing attention masks used by network. + """ + self.network.eval() + + if scipy.sparse.issparse(X): + dataloader = DataLoader( + SparsePredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + else: + dataloader = DataLoader( + PredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + + res_explain = [] + + for batch_nb, data in enumerate(dataloader): + data = data.to(self.device).float() + + M_explain, masks = self.network.forward_masks(data) + for key, value in masks.items(): + masks[key] = csc_matrix.dot( + value.cpu().detach().numpy(), self.reducing_matrix + ) + original_feat_explain = csc_matrix.dot(M_explain.cpu().detach().numpy(), + self.reducing_matrix) + res_explain.append(original_feat_explain) + + if batch_nb == 0: + res_masks = masks + else: + for key, value in masks.items(): + res_masks[key] = np.vstack([res_masks[key], value]) + + res_explain = np.vstack(res_explain) + + if normalize: + res_explain /= np.sum(res_explain, axis=1)[:, None] + + return res_explain, res_masks + + def load_weights_from_unsupervised(self, unsupervised_model): + update_state_dict = copy.deepcopy(self.network.state_dict()) + for param, 
weights in unsupervised_model.network.state_dict().items(): + if param.startswith("encoder"): + # Convert encoder's layers name to match + new_param = "tabnet." + param + else: + new_param = param + if self.network.state_dict().get(new_param) is not None: + # update only common layers + update_state_dict[new_param] = weights + + self.network.load_state_dict(update_state_dict) + + def load_class_attrs(self, class_attrs): + for attr_name, attr_value in class_attrs.items(): + setattr(self, attr_name, attr_value) + + def save_model(self, path): + """Saving TabNet model in two distinct files. + + Parameters + ---------- + path : str + Path of the model. + + Returns + ------- + str + input filepath with ".zip" appended + + """ + saved_params = {} + init_params = {} + for key, val in self.get_params().items(): + if isinstance(val, type): + # Don't save torch specific params + continue + else: + init_params[key] = val + saved_params["init_params"] = init_params + + class_attrs = { + "preds_mapper": self.preds_mapper + } + saved_params["class_attrs"] = class_attrs + + # Create folder + Path(path).mkdir(parents=True, exist_ok=True) + + # Save models params + with open(Path(path).joinpath("model_params.json"), "w", encoding="utf8") as f: + json.dump(saved_params, f, cls=ComplexEncoder) + + # Save state_dict + torch.save(self.network.state_dict(), Path(path).joinpath("network.pt")) + shutil.make_archive(path, "zip", path) + shutil.rmtree(path) + print(f"Successfully saved model at {path}.zip") + return f"{path}.zip" + + def load_model(self, filepath): + """Load TabNet model. + + Parameters + ---------- + filepath : str + Path of the model. 
+ """ + try: + with zipfile.ZipFile(filepath) as z: + with z.open("model_params.json") as f: + loaded_params = json.load(f) + loaded_params["init_params"]["device_name"] = self.device_name + with z.open("network.pt") as f: + try: + saved_state_dict = torch.load(f, map_location=self.device) + except io.UnsupportedOperation: + # In Python <3.7, the returned file object is not seekable (which at least + # some versions of PyTorch require) - so we'll try buffering it in to a + # BytesIO instead: + saved_state_dict = torch.load( + io.BytesIO(f.read()), + map_location=self.device, + ) + except KeyError: + raise KeyError("Your zip file is missing at least one component") + + self.__init__(**loaded_params["init_params"]) + + self._set_network() + self.network.load_state_dict(saved_state_dict) + self.network.eval() + self.load_class_attrs(loaded_params["class_attrs"]) + + return + + def _train_epoch(self, train_loader): + """ + Trains one epoch of the network in self.network + + Parameters + ---------- + train_loader : a :class: `torch.utils.data.Dataloader` + DataLoader with train set + """ + self.network.train() + + for batch_idx, (X, y) in enumerate(train_loader): + self._callback_container.on_batch_begin(batch_idx) + + batch_logs = self._train_batch(X, y) + + self._callback_container.on_batch_end(batch_idx, batch_logs) + + epoch_logs = {"lr": self._optimizer.param_groups[-1]["lr"]} + self.history.epoch_metrics.update(epoch_logs) + + return + + def _train_batch(self, X, y): + """ + Trains one batch of data + + Parameters + ---------- + X : torch.Tensor + Train matrix + y : torch.Tensor + Target matrix + + Returns + ------- + batch_outs : dict + Dictionnary with "y": target and "score": prediction scores. + batch_logs : dict + Dictionnary with "batch_size" and "loss". 
+ """ + batch_logs = {"batch_size": X.shape[0]} + + X = X.to(self.device).float() + y = y.to(self.device).float() + + if self.augmentations is not None: + X, y = self.augmentations(X, y) + + for param in self.network.parameters(): + param.grad = None + + output, M_loss = self.network(X) + + loss = self.compute_loss(output, y) + # Add the overall sparsity loss + loss = loss - self.lambda_sparse * M_loss + + # Perform backward pass and optimization + loss.backward() + if self.clip_value: + clip_grad_norm_(self.network.parameters(), self.clip_value) + self._optimizer.step() + + batch_logs["loss"] = loss.cpu().detach().numpy().item() + + return batch_logs + + def _predict_epoch(self, name, loader): + """ + Predict an epoch and update metrics. + + Parameters + ---------- + name : str + Name of the validation set + loader : torch.utils.data.Dataloader + DataLoader with validation set + """ + # Setting network on evaluation mode + self.network.eval() + + list_y_true = [] + list_y_score = [] + + # Main loop + for batch_idx, (X, y) in enumerate(loader): + scores = self._predict_batch(X) + list_y_true.append(y) + list_y_score.append(scores) + + y_true, scores = self.stack_batches(list_y_true, list_y_score) + + metrics_logs = self._metric_container_dict[name](y_true, scores) + self.network.train() + self.history.epoch_metrics.update(metrics_logs) + return + + def _predict_batch(self, X): + """ + Predict one batch of data. 
+ + Parameters + ---------- + X : torch.Tensor + Owned products + + Returns + ------- + np.array + model scores + """ + X = X.to(self.device).float() + + # compute model output + scores, _ = self.network(X) + + if isinstance(scores, list): + scores = [x.cpu().detach().numpy() for x in scores] + else: + scores = scores.cpu().detach().numpy() + + return scores + + def _set_network(self): + """Setup the network and explain matrix.""" + torch.manual_seed(self.seed) + + self.group_matrix = create_group_matrix(self.grouped_features, self.input_dim) + + self.network = tab_network.TabNet( + self.input_dim, + self.output_dim, + n_d=self.n_d, + n_a=self.n_a, + n_steps=self.n_steps, + gamma=self.gamma, + cat_idxs=self.cat_idxs, + cat_dims=self.cat_dims, + cat_emb_dim=self.cat_emb_dim, + n_independent=self.n_independent, + n_shared=self.n_shared, + epsilon=self.epsilon, + virtual_batch_size=self.virtual_batch_size, + momentum=self.momentum, + mask_type=self.mask_type, + group_attention_matrix=self.group_matrix.to(self.device), + ).to(self.device) + + self.reducing_matrix = create_explain_matrix( + self.network.input_dim, + self.network.cat_emb_dim, + self.network.cat_idxs, + self.network.post_embed_dim, + ) + + def _set_metrics(self, metrics, eval_names): + """Set attributes relative to the metrics. + + Parameters + ---------- + metrics : list of str + List of eval metric names. + eval_names : list of str + List of eval set names. 
+ + """ + metrics = metrics or [self._default_metric] + + metrics = check_metrics(metrics) + # Set metric container for each sets + self._metric_container_dict = {} + for name in eval_names: + self._metric_container_dict.update( + {name: MetricContainer(metrics, prefix=f"{name}_")} + ) + + self._metrics = [] + self._metrics_names = [] + for _, metric_container in self._metric_container_dict.items(): + self._metrics.extend(metric_container.metrics) + self._metrics_names.extend(metric_container.names) + + # Early stopping metric is the last eval metric + self.early_stopping_metric = ( + self._metrics_names[-1] if len(self._metrics_names) > 0 else None + ) + + def _set_callbacks(self, custom_callbacks): + """Setup the callbacks functions. + + Parameters + ---------- + custom_callbacks : list of func + List of callback functions. + + """ + # Setup default callbacks history, early stopping and scheduler + callbacks = [] + self.history = History(self, verbose=self.verbose) + callbacks.append(self.history) + if (self.early_stopping_metric is not None) and (self.patience > 0): + early_stopping = EarlyStopping( + early_stopping_metric=self.early_stopping_metric, + is_maximize=( + self._metrics[-1]._maximize if len(self._metrics) > 0 else None + ), + patience=self.patience, + ) + callbacks.append(early_stopping) + else: + wrn_msg = "No early stopping will be performed, last training weights will be used." 
+ warnings.warn(wrn_msg) + + if self.scheduler_fn is not None: + # Add LR Scheduler call_back + is_batch_level = self.scheduler_params.pop("is_batch_level", False) + scheduler = LRSchedulerCallback( + scheduler_fn=self.scheduler_fn, + scheduler_params=self.scheduler_params, + optimizer=self._optimizer, + early_stopping_metric=self.early_stopping_metric, + is_batch_level=is_batch_level, + ) + callbacks.append(scheduler) + + if custom_callbacks: + callbacks.extend(custom_callbacks) + self._callback_container = CallbackContainer(callbacks) + self._callback_container.set_trainer(self) + + def _set_optimizer(self): + """Setup optimizer.""" + self._optimizer = self.optimizer_fn( + self.network.parameters(), **self.optimizer_params + ) + + def _construct_loaders(self, X_train, y_train, eval_set): + """Generate dataloaders for train and eval set. + + Parameters + ---------- + X_train : np.array + Train set. + y_train : np.array + Train targets. + eval_set : list of tuple + List of eval tuple set (X, y). + + Returns + ------- + train_dataloader : `torch.utils.data.Dataloader` + Training dataloader. + valid_dataloaders : list of `torch.utils.data.Dataloader` + List of validation dataloaders. + + """ + # all weights are not allowed for this type of model + y_train_mapped = self.prepare_target(y_train) + for i, (X, y) in enumerate(eval_set): + y_mapped = self.prepare_target(y) + eval_set[i] = (X, y_mapped) + + train_dataloader, valid_dataloaders = create_dataloaders( + X_train, + y_train_mapped, + eval_set, + self.updated_weights, + self.batch_size, + self.num_workers, + self.drop_last, + self.pin_memory, + ) + return train_dataloader, valid_dataloaders + + def _compute_feature_importances(self, X): + """Compute global feature importance. + + Parameters + ---------- + loader : `torch.utils.data.Dataloader` + Pytorch dataloader. 
class RegressionSMOTE():
    """
    Apply SMOTE

    This will average a percentage p of the elements in the batch with other elements.
    The target will be averaged as well (this might work with binary classification
    and certain loss), following a beta distribution.

    Parameters
    ----------
    device_name : str
        Passed to ``define_device`` to pick the device used for the random draws.
    p : float
        Per-row probability of being mixed; must lie in [0, 1].
    alpha, beta : float
        Parameters of the beta distribution the mixing weights are drawn from.
    seed : int
        Seed applied to both the torch and the numpy global generators.

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 1].
    """
    def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0):
        """Validate ``p``, seed the global RNGs and resolve the device."""
        # Validate first so that a rejected construction does not reseed
        # the process-wide generators as a side effect.
        if (p < 0.) or (p > 1.0):
            raise ValueError("Value of p should be between 0. and 1.")
        self.seed = seed
        self._set_seed()
        self.device = define_device(device_name)
        self.alpha = alpha
        self.beta = beta
        self.p = p

    def _set_seed(self):
        """Seed the torch and numpy global random generators with ``self.seed``."""
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        return

    def __call__(self, X, y):
        """Mix a random subset of rows of ``X`` and ``y`` with permuted partners.

        ``X`` and ``y`` are modified in place and also returned.
        NOTE(review): the broadcasting below presumes ``y`` is 2-D
        (rows x outputs), like ``X`` — confirm against the caller.
        """
        batch_size = X.shape[0]
        random_values = torch.rand(batch_size, device=self.device)
        idx_to_change = random_values < self.p

        # ensure that first element to switch has probability > 0.5
        np_betas = np.random.beta(self.alpha, self.beta, batch_size) / 2 + 0.5
        random_betas = torch.from_numpy(np_betas).to(self.device).float()
        index_permute = torch.randperm(batch_size, device=self.device)

        betas = random_betas[idx_to_change, None]

        # Snapshot the partner rows BEFORE writing into X / y. The previous
        # code scaled the selected rows in place first and only then read
        # X[index_permute], so partners that were themselves selected had
        # already been multiplied by their own beta.
        own_X = X[idx_to_change]
        partner_X = X[index_permute][idx_to_change].view(own_X.size())
        own_y = y[idx_to_change]
        partner_y = y[index_permute][idx_to_change].view(own_y.size())

        X[idx_to_change] = betas * own_X + (1 - betas) * partner_X
        y[idx_to_change] = betas * own_y + (1 - betas) * partner_y

        return X, y


class ClassificationSMOTE():
    """
    Apply SMOTE for classification tasks.

    This will average a percentage p of the elements in the batch with other elements.
    The target will stay unchanged and keep the value of the most important row in the mix.

    Parameters
    ----------
    device_name : str
        Passed to ``define_device`` to pick the device used for the random draws.
    p : float
        Per-row probability of being mixed; must lie in [0, 1].
    alpha, beta : float
        Parameters of the beta distribution the mixing weights are drawn from.
    seed : int
        Seed applied to both the torch and the numpy global generators.

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 1].
    """
    def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0):
        """Validate ``p``, seed the global RNGs and resolve the device."""
        # Validate first so that a rejected construction does not reseed
        # the process-wide generators as a side effect.
        if (p < 0.) or (p > 1.0):
            raise ValueError("Value of p should be between 0. and 1.")
        self.seed = seed
        self._set_seed()
        self.device = define_device(device_name)
        self.alpha = alpha
        self.beta = beta
        self.p = p

    def _set_seed(self):
        """Seed the torch and numpy global random generators with ``self.seed``."""
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        return

    def __call__(self, X, y):
        """Mix a random subset of rows of ``X`` with permuted partners.

        ``X`` is modified in place and returned; ``y`` is returned untouched.
        """
        batch_size = X.shape[0]
        random_values = torch.rand(batch_size, device=self.device)
        idx_to_change = random_values < self.p

        # ensure that first element to switch has probability > 0.5
        np_betas = np.random.beta(self.alpha, self.beta, batch_size) / 2 + 0.5
        random_betas = torch.from_numpy(np_betas).to(self.device).float()
        index_permute = torch.randperm(batch_size, device=self.device)

        betas = random_betas[idx_to_change, None]

        # Read the partner rows before the in-place write so that partners
        # are taken from the original batch, not from already-scaled rows.
        own_X = X[idx_to_change]
        partner_X = X[index_permute][idx_to_change].view(own_X.size())

        X[idx_to_change] = betas * own_X + (1 - betas) * partner_X

        return X, y
@dataclass
class EarlyStopping(Callback):
    """EarlyStopping callback to exit the training loop if early_stopping_metric
    does not improve by a certain amount for a certain
    number of epochs.

    Parameters
    ---------
    early_stopping_metric : str
        Early stopping metric name
    is_maximize : bool
        Whether to maximize or not early_stopping_metric
    tol : float
        minimum change in monitored value to qualify as improvement.
        This number should be positive.
    patience : integer
        number of epochs to wait for improvement before terminating.
        the counter be reset after each improvement

    """

    early_stopping_metric: str
    is_maximize: bool
    tol: float = 0.0
    patience: int = 5

    def __post_init__(self):
        """Initialise the bookkeeping state used across epochs."""
        self.best_epoch = 0
        self.stopped_epoch = 0
        self.wait = 0
        self.best_weights = None
        # Start from the worst possible value for the chosen direction.
        self.best_loss = -np.inf if self.is_maximize else np.inf
        super().__init__()

    def on_epoch_end(self, epoch, logs=None):
        """Record an improvement or advance the patience counter.

        Does nothing when ``logs`` is missing or does not contain
        ``early_stopping_metric``.
        """
        # The signature allows logs=None, so guard before calling .get().
        current_loss = (logs or {}).get(self.early_stopping_metric)
        if current_loss is None:
            return

        loss_change = current_loss - self.best_loss
        max_improved = self.is_maximize and loss_change > self.tol
        min_improved = (not self.is_maximize) and (-loss_change > self.tol)
        if max_improved or min_improved:
            self.best_loss = current_loss
            self.best_epoch = epoch
            self.wait = 1
            # Snapshot the weights so they can be restored in on_train_end.
            self.best_weights = copy.deepcopy(self.trainer.network.state_dict())
        else:
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.trainer._stop_training = True
            self.wait += 1

    def on_train_end(self, logs=None):
        """Publish the best epoch/score on the trainer and restore best weights."""
        self.trainer.best_epoch = self.best_epoch
        self.trainer.best_cost = self.best_loss

        if self.best_weights is not None:
            self.trainer.network.load_state_dict(self.best_weights)

        best_summary = (
            f" with best_epoch = {self.best_epoch} and "
            + f"best_{self.early_stopping_metric} = {round(self.best_loss, 5)}"
        )
        if self.stopped_epoch > 0:
            msg = f"\nEarly stopping occurred at epoch {self.stopped_epoch}"
        else:
            msg = f"Stop training because you reached max_epochs = {self.trainer.max_epochs}"
        print(msg + best_summary)
        wrn_msg = "Best weights from best epoch are automatically used!"
        warnings.warn(wrn_msg)
@dataclass
class LRSchedulerCallback(Callback):
    """Wrapper for most torch scheduler functions.

    Parameters
    ---------
    scheduler_fn : torch.optim.lr_scheduler
        Torch scheduling class
    optimizer : Any
        Optimizer instance handed to ``scheduler_fn``.
    scheduler_params : dict
        Dictionary containing all parameters for the scheduler_fn
    early_stopping_metric : str
        Key looked up in the epoch ``logs`` for schedulers that take a metric.
    is_batch_level : bool (default = False)
        If set to False : lr updates will happen at every epoch
        If set to True : lr updates happen at every batch
        Set this to True for OneCycleLR for example
    """

    scheduler_fn: Any
    optimizer: Any
    scheduler_params: dict
    early_stopping_metric: str
    is_batch_level: bool = False

    def __post_init__(
        self,
    ):
        """Instantiate the scheduler and detect whether it needs a metric."""
        # Schedulers exposing ``is_better`` are stepped with the monitored value.
        self.is_metric_related = hasattr(self.scheduler_fn, "is_better")
        self.scheduler = self.scheduler_fn(self.optimizer, **self.scheduler_params)
        super().__init__()

    def on_batch_end(self, batch, logs=None):
        """Step the scheduler after every batch when ``is_batch_level`` is set."""
        if self.is_batch_level:
            self.scheduler.step()

    def on_epoch_end(self, epoch, logs=None):
        """Step epoch-level schedulers.

        Schedulers that do not consume a metric are stepped every epoch,
        whether or not the monitored metric is present in ``logs``
        (previously a missing metric silently disabled them). Metric-driven
        schedulers are skipped when the metric is unavailable.
        """
        if self.is_batch_level:
            return
        if not self.is_metric_related:
            self.scheduler.step()
            return
        # The signature allows logs=None, so guard before calling .get().
        current_loss = (logs or {}).get(self.early_stopping_metric)
        if current_loss is None:
            return
        self.scheduler.step(current_loss)
def UnsupervisedLossNumpy(y_pred, embedded_x, obf_vars, eps=1e-9):
    """NumPy version of the masked reconstruction loss.

    Squared reconstruction errors are kept only where ``obf_vars`` is
    non-zero, scaled per feature by the batch variance (sample variance,
    ``ddof=1``), averaged over the number of masked features of each row,
    then averaged over the batch.

    Parameters
    ----------
    y_pred : np.array
        Reconstructed prediction (with embeddings)
    embedded_x : np.array
        Original input embedded by network
    obf_vars : np.array
        Binary mask for obfuscated variables.
    eps : float
        Added to the per-row mask count to avoid division by zero.

    Returns
    -------
    loss : float
        Mean loss over batch samples.
    """
    masked_sq_errors = np.multiply(y_pred - embedded_x, obf_vars) ** 2

    # Per-feature mean, with zeros swapped for 1 so it can stand in below.
    feature_means = np.mean(embedded_x, axis=0)
    feature_means = np.where(feature_means == 0, 1, feature_means)

    # Per-feature variance; a zero variance falls back to the feature mean.
    feature_vars = np.std(embedded_x, axis=0, ddof=1) ** 2
    feature_vars = np.where(feature_vars == 0, feature_means, feature_vars)

    per_sample = np.matmul(masked_sq_errors, 1 / feature_vars)
    # number of obfuscated variables each row had to reconstruct
    masked_count = np.sum(obf_vars, axis=1)
    # mean over reconstructed variables, then mean over the batch
    return np.mean(per_sample / (masked_count + eps))
class Metric:
    """Base class for metrics; concrete metrics subclass it and set ``_name``."""

    def __call__(self, y_true, y_pred):
        """Compute the metric; must be overridden by subclasses."""
        raise NotImplementedError("Custom Metrics must implement this function")

    @classmethod
    def get_metrics_by_names(cls, names):
        """Instantiate the metrics matching ``names``.

        Only direct subclasses of ``cls`` are considered; each one is
        instantiated once to read its ``_name``.

        Parameters
        ----------
        cls : Metric
            Metric class.
        names : list
            List of metric names.

        Returns
        -------
        metrics : list
            List of metric instances, in the order of ``names``.

        """
        subclasses = cls.__subclasses__()
        known_names = [subclass()._name for subclass in subclasses]
        resolved = []
        for wanted in names:
            assert (
                wanted in known_names
            ), f"{wanted} is not available, choose in {known_names}"
            # First subclass carrying that name wins; build a fresh instance.
            resolved.append(subclasses[known_names.index(wanted)]())
        return resolved
+ """ + + def __init__(self): + self._name = "balanced_accuracy" + self._maximize = True + + def __call__(self, y_true, y_score): + """ + Compute Accuracy of predictions. + + Parameters + ---------- + y_true : np.ndarray + Target matrix or vector + y_score : np.ndarray + Score matrix or vector + + Returns + ------- + float + Accuracy of predictions vs targets. + """ + y_pred = np.argmax(y_score, axis=1) + return balanced_accuracy_score(y_true, y_pred) + + +class LogLoss(Metric): + """ + LogLoss. + """ + + def __init__(self): + self._name = "logloss" + self._maximize = False + + def __call__(self, y_true, y_score): + """ + Compute LogLoss of predictions. + + Parameters + ---------- + y_true : np.ndarray + Target matrix or vector + y_score : np.ndarray + Score matrix or vector + + Returns + ------- + float + LogLoss of predictions vs targets. + """ + return log_loss(y_true, y_score) + + +class MAE(Metric): + """ + Mean Absolute Error. + """ + + def __init__(self): + self._name = "mae" + self._maximize = False + + def __call__(self, y_true, y_score): + """ + Compute MAE (Mean Absolute Error) of predictions. + + Parameters + ---------- + y_true : np.ndarray + Target matrix or vector + y_score : np.ndarray + Score matrix or vector + + Returns + ------- + float + MAE of predictions vs targets. + """ + return mean_absolute_error(y_true, y_score) + + +class MSE(Metric): + """ + Mean Squared Error. + """ + + def __init__(self): + self._name = "mse" + self._maximize = False + + def __call__(self, y_true, y_score): + """ + Compute MSE (Mean Squared Error) of predictions. + + Parameters + ---------- + y_true : np.ndarray + Target matrix or vector + y_score : np.ndarray + Score matrix or vector + + Returns + ------- + float + MSE of predictions vs targets. + """ + return mean_squared_error(y_true, y_score) + + +class RMSLE(Metric): + """ + Root Mean squared logarithmic error regression loss. 
class UnsupervisedMetric(Metric):
    """
    Unsupervised metric: wraps ``UnsupervisedLoss`` and exposes it under
    the name ``unsup_loss`` (lower is better).
    """

    def __init__(self):
        self._maximize = False
        self._name = "unsup_loss"

    def __call__(self, y_pred, embedded_x, obf_vars):
        """
        Compute the unsupervised reconstruction loss.

        Parameters
        ----------
        y_pred : torch.Tensor or np.array
            Reconstructed prediction (with embeddings)
        embedded_x : torch.Tensor
            Original input embedded by network
        obf_vars : torch.Tensor
            Binary mask for obfuscated variables.
            1 means the variables was obfuscated so reconstruction is based on this.

        Returns
        -------
        float
            Value of ``UnsupervisedLoss`` converted with ``.item()``.
        """
        return UnsupervisedLoss(y_pred, embedded_x, obf_vars).item()
def check_metrics(metrics):
    """Check if custom metrics are provided.

    Parameters
    ----------
    metrics : list of str or classes
        List with built-in metrics (str) or custom metrics (classes).

    Returns
    -------
    val_metrics : list of str
        List of metric names.

    Raises
    ------
    TypeError
        If an entry is neither a string nor a ``Metric`` subclass
        (this includes metric *instances*).

    """
    val_metrics = []
    for metric in metrics:
        if isinstance(metric, str):
            val_metrics.append(metric)
        # issubclass() itself raises an unrelated TypeError on non-class
        # arguments, so make sure we have a class before asking.
        elif isinstance(metric, type) and issubclass(metric, Metric):
            val_metrics.append(metric()._name)
        else:
            raise TypeError("You need to provide a valid metric format")
    return val_metrics
np +import pandas as pd + + +def _assert_all_finite(X, allow_nan=False): + """Like assert_all_finite, but only for ndarray.""" + + X = np.asanyarray(X) + # First try an O(n) time, O(1) space solution for the common case that + # everything is finite; fall back to O(n) space np.isfinite to prevent + # false positives from overflow in sum method. The sum is also calculated + # safely to reduce dtype induced overflows. + is_float = X.dtype.kind in "fc" + if is_float and (np.isfinite(np.sum(X))): + pass + elif is_float: + msg_err = "Input contains {} or a value too large for {!r}." + if ( + allow_nan + and np.isinf(X).any() + or not allow_nan + and not np.isfinite(X).all() + ): + type_err = "infinity" if allow_nan else "NaN, infinity" + raise ValueError(msg_err.format(type_err, X.dtype)) + # for object dtype data, we only check for NaNs (GH-13254) + elif X.dtype == np.dtype("object") and not allow_nan: + if np.isnan(X).any(): + raise ValueError("Input contains NaN") + + +def assert_all_finite(X, allow_nan=False): + """Throw a ValueError if X contains NaN or infinity. 
def unique_labels(*ys):
    """Extract an ordered array of unique labels

    All inputs must share one target type (as reported by
    ``type_of_target``), except that ``binary`` and ``multiclass`` may be
    combined. Labels must be either all strings or all non-strings.

    Parameters
    ----------
    *ys : array-likes

    Returns
    -------
    out : numpy array of shape [n_unique_labels]
        An ordered array of unique labels.

    Raises
    ------
    ValueError
        If no argument is given, target types are mixed, the target type
        has no registered label extractor, or string and non-string labels
        are mixed.

    Examples
    --------
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    if not ys:
        raise ValueError("No argument has been passed.")

    # Check that we don't mix label format
    target_kinds = set(type_of_target(y) for y in ys)
    if target_kinds == {"binary", "multiclass"}:
        target_kinds = {"multiclass"}

    if len(target_kinds) > 1:
        raise ValueError("Mix type of y not allowed, got types %s" % target_kinds)

    (kind,) = target_kinds

    # Look up the label extractor registered for this target type
    extractor = _FN_UNIQUE_LABELS.get(kind, None)
    if not extractor:
        raise ValueError("Unknown label type: %s" % repr(ys))

    collected = set()
    for y in ys:
        collected.update(extractor(y))

    # Check that we don't mix string type with number type
    string_flags = {isinstance(label, str) for label in collected}
    if len(string_flags) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return np.array(sorted(collected))
def type_of_target(y):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    target_type : string
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Raises
    ------
    ValueError
        If `y` is not array-like (or is a string), if its class is named
        ``SparseSeries``, if it uses the legacy sequence-of-sequences
        multi-label format, or if a non-integer float target fails the
        ``_assert_all_finite`` check.

    Examples
    --------
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multiclass-multioutput'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    # Accept sequences, sparse matrices and anything exposing __array__,
    # but never a bare string.
    valid = (
        isinstance(y, (Sequence, spmatrix)) or hasattr(y, "__array__")
    ) and not isinstance(y, str)

    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), " "got %r" % y
        )

    # Detected by class name only, so no import of the class is required.
    sparseseries = y.__class__.__name__ == "SparseSeries"
    if sparseseries:
        raise ValueError("y cannot be class 'SparseSeries'.")

    # Must run before the np.asarray conversion below: is_multilabel handles
    # sparse input itself.
    if is_multilabel(y):
        return "multilabel-indicator"

    try:
        y = np.asarray(y)
    except ValueError:
        # Known to fail in numpy 1.3 for array of arrays
        return "unknown"

    # The old sequence of sequences format
    try:
        if (
            not hasattr(y[0], "__array__")
            and isinstance(y[0], Sequence)
            and not isinstance(y[0], str)
        ):
            raise ValueError(
                "You appear to be using a legacy multi-label data"
                " representation. Sequence of sequences are no"
                " longer supported; use a binary array or sparse"
                " matrix instead - the MultiLabelBinarizer"
                " transformer can convert to this format."
            )
    except IndexError:
        # y[0] does not exist (empty input): nothing to check here.
        pass

    # Invalid inputs
    if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], str)):
        return "unknown"  # [[[1, 2]]] or [obj_1] and not ["label_1"]

    if y.ndim == 2 and y.shape[1] == 0:
        return "unknown"  # [[]]

    # The suffix distinguishes single-column targets from multi-column ones.
    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # check float and contains non-integer float values
    if y.dtype.kind == "f" and np.any(y != y.astype(int)):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        # Reject NaN / infinity before labelling the target as continuous.
        _assert_all_finite(y)
        return "continuous" + suffix

    # More than two distinct values, or more than one column, means multiclass.
    if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
        return "multiclass" + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
    else:
        return "binary"  # [1, 2] or [["a"], ["b"]]
Target has types {target_types}" + ) + + +def infer_output_dim(y_train): + """ + Infer output_dim from targets + + Parameters + ---------- + y_train : np.array + Training targets + + Returns + ------- + output_dim : int + Number of classes for output + train_labels : list + Sorted list of initial classes + """ + check_unique_type(y_train) + train_labels = unique_labels(y_train) + output_dim = len(train_labels) + + return output_dim, train_labels + + +def check_output_dim(labels, y): + if y is not None: + check_unique_type(y) + valid_labels = unique_labels(y) + if not set(valid_labels).issubset(set(labels)): + raise ValueError( + f"""Valid set -- {set(valid_labels)} -- + contains unkown targets from training -- + {set(labels)}""" + ) + return + + +def infer_multitask_output(y_train): + """ + Infer output_dim from targets + This is for multiple tasks. + + Parameters + ---------- + y_train : np.ndarray + Training targets + + Returns + ------- + tasks_dims : list + Number of classes for output + tasks_labels : list + List of sorted list of initial classes + """ + + if len(y_train.shape) < 2: + raise ValueError( + "y_train should be of shape (n_examples, n_tasks)" + + f"but got {y_train.shape}" + ) + nb_tasks = y_train.shape[1] + tasks_dims = [] + tasks_labels = [] + for task_idx in range(nb_tasks): + try: + output_dim, train_labels = infer_output_dim(y_train[:, task_idx]) + tasks_dims.append(output_dim) + tasks_labels.append(train_labels) + except ValueError as err: + raise ValueError(f"""Error for task {task_idx} : {err}""") + return tasks_dims, tasks_labels diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py new file mode 100644 index 00000000..da836203 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py @@ -0,0 +1,178 @@ +import torch +import numpy as np +from scipy.special import softmax +from pytorch_tabnet.utils import SparsePredictDataset, 
PredictDataset, filter_weights +from pytorch_tabnet.abstract_model import TabModel +from pytorch_tabnet.multiclass_utils import infer_multitask_output, check_output_dim +from torch.utils.data import DataLoader +import scipy + + +class TabNetMultiTaskClassifier(TabModel): + def __post_init__(self): + super(TabNetMultiTaskClassifier, self).__post_init__() + self._task = 'classification' + self._default_loss = torch.nn.functional.cross_entropy + self._default_metric = 'logloss' + + def prepare_target(self, y): + y_mapped = y.copy() + for task_idx in range(y.shape[1]): + task_mapper = self.target_mapper[task_idx] + y_mapped[:, task_idx] = np.vectorize(task_mapper.get)(y[:, task_idx]) + return y_mapped + + def compute_loss(self, y_pred, y_true): + """ + Computes the loss according to network output and targets + + Parameters + ---------- + y_pred : list of tensors + Output of network + y_true : LongTensor + Targets label encoded + + Returns + ------- + loss : torch.Tensor + output of loss function(s) + + """ + loss = 0 + y_true = y_true.long() + if isinstance(self.loss_fn, list): + # if you specify a different loss for each task + for task_loss, task_output, task_id in zip( + self.loss_fn, y_pred, range(len(self.loss_fn)) + ): + loss += task_loss(task_output, y_true[:, task_id]) + else: + # same loss function is applied to all tasks + for task_id, task_output in enumerate(y_pred): + loss += self.loss_fn(task_output, y_true[:, task_id]) + + loss /= len(y_pred) + return loss + + def stack_batches(self, list_y_true, list_y_score): + y_true = np.vstack(list_y_true) + y_score = [] + for i in range(len(self.output_dim)): + score = np.vstack([x[i] for x in list_y_score]) + score = softmax(score, axis=1) + y_score.append(score) + return y_true, y_score + + def update_fit_params(self, X_train, y_train, eval_set, weights): + output_dim, train_labels = infer_multitask_output(y_train) + for _, y in eval_set: + for task_idx in range(y.shape[1]): + 
check_output_dim(train_labels[task_idx], y[:, task_idx]) + self.output_dim = output_dim + self.classes_ = train_labels + self.target_mapper = [ + {class_label: index for index, class_label in enumerate(classes)} + for classes in self.classes_ + ] + self.preds_mapper = [ + {str(index): str(class_label) for index, class_label in enumerate(classes)} + for classes in self.classes_ + ] + self.updated_weights = weights + filter_weights(self.updated_weights) + + def predict(self, X): + """ + Make predictions on a batch (valid) + + Parameters + ---------- + X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` + Input data + + Returns + ------- + results : np.array + Predictions of the most probable class + """ + self.network.eval() + + if scipy.sparse.issparse(X): + dataloader = DataLoader( + SparsePredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + else: + dataloader = DataLoader( + PredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + + results = {} + for data in dataloader: + data = data.to(self.device).float() + output, _ = self.network(data) + predictions = [ + torch.argmax(torch.nn.Softmax(dim=1)(task_output), dim=1) + .cpu() + .detach() + .numpy() + .reshape(-1) + for task_output in output + ] + + for task_idx in range(len(self.output_dim)): + results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] + # stack all task individually + results = [np.hstack(task_res) for task_res in results.values()] + # map all task individually + results = [ + np.vectorize(self.preds_mapper[task_idx].get)(task_res.astype(str)) + for task_idx, task_res in enumerate(results) + ] + return results + + def predict_proba(self, X): + """ + Make predictions for classification on a batch (valid) + + Parameters + ---------- + X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` + Input data + + Returns + ------- + res : list of np.ndarray + + """ + self.network.eval() + + if scipy.sparse.issparse(X): + dataloader = 
DataLoader( + SparsePredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + else: + dataloader = DataLoader( + PredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + + results = {} + for data in dataloader: + data = data.to(self.device).float() + output, _ = self.network(data) + predictions = [ + torch.nn.Softmax(dim=1)(task_output).cpu().detach().numpy() + for task_output in output + ] + for task_idx in range(len(self.output_dim)): + results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] + res = [np.vstack(task_res) for task_res in results.values()] + return res diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py new file mode 100644 index 00000000..87de306d --- /dev/null +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py @@ -0,0 +1,428 @@ +import torch +import numpy as np +from torch.utils.data import DataLoader +from pytorch_tabnet import tab_network +from pytorch_tabnet.utils import ( + create_explain_matrix, + filter_weights, + SparsePredictDataset, + PredictDataset, + check_input, + create_group_matrix, +) +from torch.nn.utils import clip_grad_norm_ +from pytorch_tabnet.pretraining_utils import ( + create_dataloaders, + validate_eval_set, +) +from pytorch_tabnet.metrics import ( + UnsupMetricContainer, + check_metrics, + UnsupervisedLoss, +) +from pytorch_tabnet.abstract_model import TabModel +import scipy + + +class TabNetPretrainer(TabModel): + def __post_init__(self): + super(TabNetPretrainer, self).__post_init__() + self._task = 'unsupervised' + self._default_loss = UnsupervisedLoss + self._default_metric = 'unsup_loss_numpy' + + def prepare_target(self, y): + return y + + def compute_loss(self, output, embedded_x, obf_vars): + return self.loss_fn(output, embedded_x, obf_vars) + + def update_fit_params( + self, + weights, + ): + self.updated_weights = weights + filter_weights(self.updated_weights) + 
self.preds_mapper = None + + def fit( + self, + X_train, + eval_set=None, + eval_name=None, + loss_fn=None, + pretraining_ratio=0.5, + weights=0, + max_epochs=100, + patience=10, + batch_size=1024, + virtual_batch_size=128, + num_workers=0, + drop_last=True, + callbacks=None, + pin_memory=True, + warm_start=False + ): + """Train a neural network stored in self.network + Using train_dataloader for training data and + valid_dataloader for validation. + + Parameters + ---------- + X_train : np.ndarray + Train set to reconstruct in self supervision + eval_set : list of np.array + List of evaluation set + The last one is used for early stopping + eval_name : list of str + List of eval set names. + eval_metric : list of str + List of evaluation metrics. + The last metric is used for early stopping. + loss_fn : callable or None + a PyTorch loss function + should be left to None for self supervised and non experts + pretraining_ratio : float + Between 0 and 1, percentage of feature to mask for reconstruction + weights : np.array + Sampling weights for each example. 
+ max_epochs : int + Maximum number of epochs during training + patience : int + Number of consecutive non improving epoch before early stopping + batch_size : int + Training batch size + virtual_batch_size : int + Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size) + num_workers : int + Number of workers used in torch.utils.data.DataLoader + drop_last : bool + Whether to drop last batch during training + callbacks : list of callback function + List of custom callbacks + pin_memory: bool + Whether to set pin_memory to True or False during training + """ + # update model name + + self.max_epochs = max_epochs + self.patience = patience + self.batch_size = batch_size + self.virtual_batch_size = virtual_batch_size + self.num_workers = num_workers + self.drop_last = drop_last + self.input_dim = X_train.shape[1] + self._stop_training = False + self.pin_memory = pin_memory and (self.device.type != "cpu") + self.pretraining_ratio = pretraining_ratio + eval_set = eval_set if eval_set else [] + + if loss_fn is None: + self.loss_fn = self._default_loss + else: + self.loss_fn = loss_fn + + check_input(X_train) + + self.update_fit_params( + weights, + ) + + # Validate and reformat eval set depending on training data + eval_names = validate_eval_set(eval_set, eval_name, X_train) + train_dataloader, valid_dataloaders = self._construct_loaders( + X_train, eval_set + ) + + if not hasattr(self, "network") or not warm_start: + # model has never been fitted before of warm_start is False + self._set_network() + + self._update_network_params() + self._set_metrics(eval_names) + self._set_optimizer() + self._set_callbacks(callbacks) + + # Call method on_train_begin for all callbacks + self._callback_container.on_train_begin() + + # Training loop over epochs + for epoch_idx in range(self.max_epochs): + + # Call method on_epoch_begin for all callbacks + self._callback_container.on_epoch_begin(epoch_idx) + + self._train_epoch(train_dataloader) + + # Apply predict epoch 
to all eval sets + for eval_name, valid_dataloader in zip(eval_names, valid_dataloaders): + self._predict_epoch(eval_name, valid_dataloader) + + # Call method on_epoch_end for all callbacks + self._callback_container.on_epoch_end( + epoch_idx, logs=self.history.epoch_metrics + ) + + if self._stop_training: + break + + # Call method on_train_end for all callbacks + self._callback_container.on_train_end() + self.network.eval() + + def _set_network(self): + """Setup the network and explain matrix.""" + if not hasattr(self, 'pretraining_ratio'): + self.pretraining_ratio = 0.5 + torch.manual_seed(self.seed) + + self.group_matrix = create_group_matrix(self.grouped_features, self.input_dim) + + self.network = tab_network.TabNetPretraining( + self.input_dim, + pretraining_ratio=self.pretraining_ratio, + n_d=self.n_d, + n_a=self.n_a, + n_steps=self.n_steps, + gamma=self.gamma, + cat_idxs=self.cat_idxs, + cat_dims=self.cat_dims, + cat_emb_dim=self.cat_emb_dim, + n_independent=self.n_independent, + n_shared=self.n_shared, + n_shared_decoder=self.n_shared_decoder, + n_indep_decoder=self.n_indep_decoder, + epsilon=self.epsilon, + virtual_batch_size=self.virtual_batch_size, + momentum=self.momentum, + mask_type=self.mask_type, + group_attention_matrix=self.group_matrix.to(self.device), + ).to(self.device) + + self.reducing_matrix = create_explain_matrix( + self.network.input_dim, + self.network.cat_emb_dim, + self.network.cat_idxs, + self.network.post_embed_dim, + ) + + def _update_network_params(self): + self.network.virtual_batch_size = self.virtual_batch_size + self.network.pretraining_ratio = self.pretraining_ratio + + def _set_metrics(self, eval_names): + """Set attributes relative to the metrics. + + Parameters + ---------- + metrics : list of str + List of eval metric names. + eval_names : list of str + List of eval set names. 
+ + """ + metrics = [self._default_metric] + + metrics = check_metrics(metrics) + # Set metric container for each sets + self._metric_container_dict = {} + for name in eval_names: + self._metric_container_dict.update( + {name: UnsupMetricContainer(metrics, prefix=f"{name}_")} + ) + + self._metrics = [] + self._metrics_names = [] + for _, metric_container in self._metric_container_dict.items(): + self._metrics.extend(metric_container.metrics) + self._metrics_names.extend(metric_container.names) + + # Early stopping metric is the last eval metric + self.early_stopping_metric = ( + self._metrics_names[-1] if len(self._metrics_names) > 0 else None + ) + + def _construct_loaders(self, X_train, eval_set): + """Generate dataloaders for unsupervised train and eval set. + + Parameters + ---------- + X_train : np.array + Train set. + eval_set : list of tuple + List of eval tuple set (X, y). + + Returns + ------- + train_dataloader : `torch.utils.data.Dataloader` + Training dataloader. + valid_dataloaders : list of `torch.utils.data.Dataloader` + List of validation dataloaders. 
+ + """ + train_dataloader, valid_dataloaders = create_dataloaders( + X_train, + eval_set, + self.updated_weights, + self.batch_size, + self.num_workers, + self.drop_last, + self.pin_memory, + ) + return train_dataloader, valid_dataloaders + + def _train_epoch(self, train_loader): + """ + Trains one epoch of the network in self.network + + Parameters + ---------- + train_loader : a :class: `torch.utils.data.Dataloader` + DataLoader with train set + """ + self.network.train() + + for batch_idx, X in enumerate(train_loader): + self._callback_container.on_batch_begin(batch_idx) + + batch_logs = self._train_batch(X) + + self._callback_container.on_batch_end(batch_idx, batch_logs) + + epoch_logs = {"lr": self._optimizer.param_groups[-1]["lr"]} + self.history.epoch_metrics.update(epoch_logs) + + return + + def _train_batch(self, X): + """ + Trains one batch of data + + Parameters + ---------- + X : torch.Tensor + Train matrix + + Returns + ------- + batch_outs : dict + Dictionnary with "y": target and "score": prediction scores. + batch_logs : dict + Dictionnary with "batch_size" and "loss". + """ + batch_logs = {"batch_size": X.shape[0]} + + X = X.to(self.device).float() + + for param in self.network.parameters(): + param.grad = None + + output, embedded_x, obf_vars = self.network(X) + loss = self.compute_loss(output, embedded_x, obf_vars) + + # Perform backward pass and optimization + loss.backward() + if self.clip_value: + clip_grad_norm_(self.network.parameters(), self.clip_value) + self._optimizer.step() + + batch_logs["loss"] = loss.cpu().detach().numpy().item() + + return batch_logs + + def _predict_epoch(self, name, loader): + """ + Predict an epoch and update metrics. 
+ + Parameters + ---------- + name : str + Name of the validation set + loader : torch.utils.data.Dataloader + DataLoader with validation set + """ + # Setting network on evaluation mode + self.network.eval() + + list_output = [] + list_embedded_x = [] + list_obfuscation = [] + # Main loop + for batch_idx, X in enumerate(loader): + output, embedded_x, obf_vars = self._predict_batch(X) + list_output.append(output.cpu().detach().numpy()) + list_embedded_x.append(embedded_x.cpu().detach().numpy()) + list_obfuscation.append(obf_vars.cpu().detach().numpy()) + + output, embedded_x, obf_vars = self.stack_batches(list_output, + list_embedded_x, + list_obfuscation) + + metrics_logs = self._metric_container_dict[name](output, embedded_x, obf_vars) + self.network.train() + self.history.epoch_metrics.update(metrics_logs) + return + + def _predict_batch(self, X): + """ + Predict one batch of data. + + Parameters + ---------- + X : torch.Tensor + Owned products + + Returns + ------- + np.array + model scores + """ + X = X.to(self.device).float() + return self.network(X) + + def stack_batches(self, list_output, list_embedded_x, list_obfuscation): + output = np.vstack(list_output) + embedded_x = np.vstack(list_embedded_x) + obf_vars = np.vstack(list_obfuscation) + return output, embedded_x, obf_vars + + def predict(self, X): + """ + Make predictions on a batch (valid) + + Parameters + ---------- + X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` + Input data + + Returns + ------- + predictions : np.array + Predictions of the regression problem + """ + self.network.eval() + + if scipy.sparse.issparse(X): + dataloader = DataLoader( + SparsePredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + else: + dataloader = DataLoader( + PredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + + results = [] + embedded_res = [] + for batch_nb, data in enumerate(dataloader): + data = data.to(self.device).float() + output, embeded_x, _ = 
self.network(data) + predictions = output.cpu().detach().numpy() + results.append(predictions) + embedded_res.append(embeded_x.cpu().detach().numpy()) + res_output = np.vstack(results) + embedded_inputs = np.vstack(embedded_res) + return res_output, embedded_inputs diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py new file mode 100644 index 00000000..0874be95 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py @@ -0,0 +1,128 @@ +from torch.utils.data import DataLoader +from pytorch_tabnet.utils import ( + create_sampler, + SparsePredictDataset, + PredictDataset, + check_input +) +import scipy + + +def create_dataloaders( + X_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory +): + """ + Create dataloaders with or without subsampling depending on weights and balanced. + + Parameters + ---------- + X_train : np.ndarray or scipy.sparse.csr_matrix + Training data + eval_set : list of np.array (for Xs and ys) or scipy.sparse.csr_matrix (for Xs) + List of eval sets + weights : either 0, 1, dict or iterable + if 0 (default) : no weights will be applied + if 1 : classification only, will balanced class with inverse frequency + if dict : keys are corresponding class values are sample weights + if iterable : list or np array must be of length equal to nb elements + in the training set + batch_size : int + how many samples per batch to load + num_workers : int + how many subprocesses to use for data loading. 0 means that the data + will be loaded in the main process + drop_last : bool + set to True to drop the last incomplete batch, if the dataset size is not + divisible by the batch size. 
If False and the size of dataset is not + divisible by the batch size, then the last batch will be smaller + pin_memory : bool + Whether to pin GPU memory during training + + Returns + ------- + train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader + Training and validation dataloaders + """ + need_shuffle, sampler = create_sampler(weights, X_train) + + if scipy.sparse.issparse(X_train): + train_dataloader = DataLoader( + SparsePredictDataset(X_train), + batch_size=batch_size, + sampler=sampler, + shuffle=need_shuffle, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=pin_memory, + ) + else: + train_dataloader = DataLoader( + PredictDataset(X_train), + batch_size=batch_size, + sampler=sampler, + shuffle=need_shuffle, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=pin_memory, + ) + + valid_dataloaders = [] + for X in eval_set: + if scipy.sparse.issparse(X): + valid_dataloaders.append( + DataLoader( + SparsePredictDataset(X), + batch_size=batch_size, + sampler=sampler, + shuffle=need_shuffle, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=pin_memory, + ) + ) + else: + valid_dataloaders.append( + DataLoader( + PredictDataset(X), + batch_size=batch_size, + sampler=sampler, + shuffle=need_shuffle, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=pin_memory, + ) + ) + + return train_dataloader, valid_dataloaders + + +def validate_eval_set(eval_set, eval_name, X_train): + """Check if the shapes of eval_set are compatible with X_train. + + Parameters + ---------- + eval_set : List of numpy array + The list evaluation set. + The last one is used for early stopping + X_train : np.ndarray + Train owned products + + Returns + ------- + eval_names : list of str + Validated list of eval_names. 
+ + """ + eval_names = eval_name or [f"val_{i}" for i in range(len(eval_set))] + assert len(eval_set) == len( + eval_names + ), "eval_set and eval_name have not the same length" + + for set_nb, X in enumerate(eval_set): + check_input(X) + msg = ( + f"Number of columns is different between eval set {set_nb}" + + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" + ) + assert X.shape[1] == X_train.shape[1], msg + return eval_names diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py new file mode 100644 index 00000000..9862efa4 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py @@ -0,0 +1,278 @@ +from torch import nn +from torch.autograd import Function +import torch.nn.functional as F + +import torch + +""" +Other possible implementations: +https://github.com/KrisKorrel/sparsemax-pytorch/blob/master/sparsemax.py +https://github.com/msobroza/SparsemaxPytorch/blob/master/mnist/sparsemax.py +https://github.com/vene/sparse-structured-attention/blob/master/pytorch/torchsparseattn/sparsemax.py +""" + + +# credits to Yandex https://github.com/Qwicen/node/blob/master/lib/nn_utils.py +def _make_ix_like(input, dim=0): + d = input.size(dim) + rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) + view = [1] * input.dim() + view[0] = -1 + return rho.view(view).transpose(0, dim) + + +class SparsemaxFunction(Function): + """ + An implementation of sparsemax (Martins & Astudillo, 2016). See + :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 
+ By Ben Peters and Vlad Niculae + """ + + @staticmethod + def forward(ctx, input, dim=-1): + """sparsemax: normalizing sparse transform (a la softmax) + + Parameters + ---------- + ctx : torch.autograd.function._ContextMethodMixin + input : torch.Tensor + any shape + dim : int + dimension along which to apply sparsemax + + Returns + ------- + output : torch.Tensor + same shape as input + + """ + ctx.dim = dim + max_val, _ = input.max(dim=dim, keepdim=True) + input -= max_val # same numerical stability trick as for softmax + tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) + output = torch.clamp(input - tau, min=0) + ctx.save_for_backward(supp_size, output) + return output + + @staticmethod + def backward(ctx, grad_output): + supp_size, output = ctx.saved_tensors + dim = ctx.dim + grad_input = grad_output.clone() + grad_input[output == 0] = 0 + + v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() + v_hat = v_hat.unsqueeze(dim) + grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) + return grad_input, None + + @staticmethod + def _threshold_and_support(input, dim=-1): + """Sparsemax building block: compute the threshold + + Parameters + ---------- + input: torch.Tensor + any dimension + dim : int + dimension along which to apply the sparsemax + + Returns + ------- + tau : torch.Tensor + the threshold value + support_size : torch.Tensor + + """ + + input_srt, _ = torch.sort(input, descending=True, dim=dim) + input_cumsum = input_srt.cumsum(dim) - 1 + rhos = _make_ix_like(input, dim) + support = rhos * input_srt > input_cumsum + + support_size = support.sum(dim=dim).unsqueeze(dim) + tau = input_cumsum.gather(dim, support_size - 1) + tau /= support_size.to(input.dtype) + return tau, support_size + + +sparsemax = SparsemaxFunction.apply + + +class Sparsemax(nn.Module): + + def __init__(self, dim=-1): + self.dim = dim + super(Sparsemax, self).__init__() + + def forward(self, input): + return sparsemax(input, 
self.dim) + + +class Entmax15Function(Function): + """ + An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See + :cite:`https://arxiv.org/abs/1905.05702 for detailed description. + Source: https://github.com/deep-spin/entmax + """ + + @staticmethod + def forward(ctx, input, dim=-1): + ctx.dim = dim + + max_val, _ = input.max(dim=dim, keepdim=True) + input = input - max_val # same numerical stability trick as for softmax + input = input / 2 # divide by 2 to solve actual Entmax + + tau_star, _ = Entmax15Function._threshold_and_support(input, dim) + output = torch.clamp(input - tau_star, min=0) ** 2 + ctx.save_for_backward(output) + return output + + @staticmethod + def backward(ctx, grad_output): + Y, = ctx.saved_tensors + gppr = Y.sqrt() # = 1 / g'' (Y) + dX = grad_output * gppr + q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) + q = q.unsqueeze(ctx.dim) + dX -= q * gppr + return dX, None + + @staticmethod + def _threshold_and_support(input, dim=-1): + Xsrt, _ = torch.sort(input, descending=True, dim=dim) + + rho = _make_ix_like(input, dim) + mean = Xsrt.cumsum(dim) / rho + mean_sq = (Xsrt ** 2).cumsum(dim) / rho + ss = rho * (mean_sq - mean ** 2) + delta = (1 - ss) / rho + + # NOTE this is not exactly the same as in reference algo + # Fortunately it seems the clamped values never wrongly + # get selected by tau <= sorted_z. Prove this! 
+ delta_nz = torch.clamp(delta, 0) + tau = mean - torch.sqrt(delta_nz) + + support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) + tau_star = tau.gather(dim, support_size - 1) + return tau_star, support_size + + +class Entmoid15(Function): + """ A highly optimized equivalent of lambda x: Entmax15([x, 0]) """ + + @staticmethod + def forward(ctx, input): + output = Entmoid15._forward(input) + ctx.save_for_backward(output) + return output + + @staticmethod + def _forward(input): + input, is_pos = abs(input), input >= 0 + tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 + tau.masked_fill_(tau <= input, 2.0) + y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 + return torch.where(is_pos, 1 - y_neg, y_neg) + + @staticmethod + def backward(ctx, grad_output): + return Entmoid15._backward(ctx.saved_tensors[0], grad_output) + + @staticmethod + def _backward(output, grad_output): + gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() + grad_input = grad_output * gppr0 + q = grad_input / (gppr0 + gppr1) + grad_input -= q * gppr0 + return grad_input + + +entmax15 = Entmax15Function.apply +entmoid15 = Entmoid15.apply + + +class Entmax15(nn.Module): + + def __init__(self, dim=-1): + self.dim = dim + super(Entmax15, self).__init__() + + def forward(self, input): + return entmax15(input, self.dim) + + +# Credits were lost... 
+# def _make_ix_like(input, dim=0): +# d = input.size(dim) +# rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) +# view = [1] * input.dim() +# view[0] = -1 +# return rho.view(view).transpose(0, dim) +# +# +# def _threshold_and_support(input, dim=0): +# """Sparsemax building block: compute the threshold +# Args: +# input: any dimension +# dim: dimension along which to apply the sparsemax +# Returns: +# the threshold value +# """ +# +# input_srt, _ = torch.sort(input, descending=True, dim=dim) +# input_cumsum = input_srt.cumsum(dim) - 1 +# rhos = _make_ix_like(input, dim) +# support = rhos * input_srt > input_cumsum +# +# support_size = support.sum(dim=dim).unsqueeze(dim) +# tau = input_cumsum.gather(dim, support_size - 1) +# tau /= support_size.to(input.dtype) +# return tau, support_size +# +# +# class SparsemaxFunction(Function): +# +# @staticmethod +# def forward(ctx, input, dim=0): +# """sparsemax: normalizing sparse transform (a la softmax) +# Parameters: +# input (Tensor): any shape +# dim: dimension along which to apply sparsemax +# Returns: +# output (Tensor): same shape as input +# """ +# ctx.dim = dim +# max_val, _ = input.max(dim=dim, keepdim=True) +# input -= max_val # same numerical stability trick as for softmax +# tau, supp_size = _threshold_and_support(input, dim=dim) +# output = torch.clamp(input - tau, min=0) +# ctx.save_for_backward(supp_size, output) +# return output +# +# @staticmethod +# def backward(ctx, grad_output): +# supp_size, output = ctx.saved_tensors +# dim = ctx.dim +# grad_input = grad_output.clone() +# grad_input[output == 0] = 0 +# +# v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() +# v_hat = v_hat.unsqueeze(dim) +# grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) +# return grad_input, None +# +# +# sparsemax = SparsemaxFunction.apply +# +# +# class Sparsemax(nn.Module): +# +# def __init__(self, dim=0): +# self.dim = dim +# super(Sparsemax, self).__init__() +# +# def 
forward(self, input): +# return sparsemax(input, self.dim) diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py new file mode 100755 index 00000000..ff01991c --- /dev/null +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py @@ -0,0 +1,154 @@ +import torch +import numpy as np +from scipy.special import softmax +from pytorch_tabnet.utils import SparsePredictDataset, PredictDataset, filter_weights +from pytorch_tabnet.abstract_model import TabModel +from pytorch_tabnet.multiclass_utils import infer_output_dim, check_output_dim +from torch.utils.data import DataLoader +import scipy + + +class TabNetClassifier(TabModel): + def __post_init__(self): + super(TabNetClassifier, self).__post_init__() + self._task = 'classification' + self._default_loss = torch.nn.functional.cross_entropy + self._default_metric = 'accuracy' + + def weight_updater(self, weights): + """ + Updates weights dictionary according to target_mapper. + + Parameters + ---------- + weights : bool or dict + Given weights for balancing training. + + Returns + ------- + bool or dict + Same bool if weights are bool, updated dict otherwise. 
+ + """ + if isinstance(weights, int): + return weights + elif isinstance(weights, dict): + return {self.target_mapper[key]: value for key, value in weights.items()} + else: + return weights + + def prepare_target(self, y): + return np.vectorize(self.target_mapper.get)(y) + + def compute_loss(self, y_pred, y_true): + return self.loss_fn(y_pred, y_true.long()) + + def update_fit_params( + self, + X_train, + y_train, + eval_set, + weights, + ): + output_dim, train_labels = infer_output_dim(y_train) + for X, y in eval_set: + check_output_dim(train_labels, y) + self.output_dim = output_dim + self._default_metric = ('auc' if self.output_dim == 2 else 'accuracy') + self.classes_ = train_labels + self.target_mapper = { + class_label: index for index, class_label in enumerate(self.classes_) + } + self.preds_mapper = { + str(index): class_label for index, class_label in enumerate(self.classes_) + } + self.updated_weights = self.weight_updater(weights) + + def stack_batches(self, list_y_true, list_y_score): + y_true = np.hstack(list_y_true) + y_score = np.vstack(list_y_score) + y_score = softmax(y_score, axis=1) + return y_true, y_score + + def predict_func(self, outputs): + outputs = np.argmax(outputs, axis=1) + return np.vectorize(self.preds_mapper.get)(outputs.astype(str)) + + def predict_proba(self, X): + """ + Make predictions for classification on a batch (valid) + + Parameters + ---------- + X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` + Input data + + Returns + ------- + res : np.ndarray + + """ + self.network.eval() + + if scipy.sparse.issparse(X): + dataloader = DataLoader( + SparsePredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + else: + dataloader = DataLoader( + PredictDataset(X), + batch_size=self.batch_size, + shuffle=False, + ) + + results = [] + for batch_nb, data in enumerate(dataloader): + data = data.to(self.device).float() + + output, M_loss = self.network(data) + predictions = 
torch.nn.Softmax(dim=1)(output).cpu().detach().numpy() + results.append(predictions) + res = np.vstack(results) + return res + + +class TabNetRegressor(TabModel): + def __post_init__(self): + super(TabNetRegressor, self).__post_init__() + self._task = 'regression' + self._default_loss = torch.nn.functional.mse_loss + self._default_metric = 'mse' + + def prepare_target(self, y): + return y + + def compute_loss(self, y_pred, y_true): + return self.loss_fn(y_pred, y_true) + + def update_fit_params( + self, + X_train, + y_train, + eval_set, + weights + ): + if len(y_train.shape) != 2: + msg = "Targets should be 2D : (n_samples, n_regression) " + \ + f"but y_train.shape={y_train.shape} given.\n" + \ + "Use reshape(-1, 1) for single regression." + raise ValueError(msg) + self.output_dim = y_train.shape[1] + self.preds_mapper = None + + self.updated_weights = weights + filter_weights(self.updated_weights) + + def predict_func(self, outputs): + return outputs + + def stack_batches(self, list_y_true, list_y_score): + y_true = np.vstack(list_y_true) + y_score = np.vstack(list_y_score) + return y_true, y_score diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py new file mode 100644 index 00000000..95c2bae2 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py @@ -0,0 +1,934 @@ +import torch +from torch.nn import Linear, BatchNorm1d, ReLU +import numpy as np +from pytorch_tabnet import sparsemax + + +def initialize_non_glu(module, input_dim, output_dim): + gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(4 * input_dim)) + torch.nn.init.xavier_normal_(module.weight, gain=gain_value) + # torch.nn.init.zeros_(module.bias) + return + + +def initialize_glu(module, input_dim, output_dim): + gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(input_dim)) + torch.nn.init.xavier_normal_(module.weight, gain=gain_value) + # torch.nn.init.zeros_(module.bias) + 
return + + +class GBN(torch.nn.Module): + """ + Ghost Batch Normalization + https://arxiv.org/abs/1705.08741 + """ + + def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01): + super(GBN, self).__init__() + + self.input_dim = input_dim + self.virtual_batch_size = virtual_batch_size + self.bn = BatchNorm1d(self.input_dim, momentum=momentum) + + def forward(self, x): + chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0) + res = [self.bn(x_) for x_ in chunks] + + return torch.cat(res, dim=0) + + +class TabNetEncoder(torch.nn.Module): + def __init__( + self, + input_dim, + output_dim, + n_d=8, + n_a=8, + n_steps=3, + gamma=1.3, + n_independent=2, + n_shared=2, + epsilon=1e-15, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + group_attention_matrix=None, + ): + """ + Defines main part of the TabNet network without the embedding layers. + + Parameters + ---------- + input_dim : int + Number of features + output_dim : int or list of int for multi task classification + Dimension of network output + examples : one for regression, 2 for binary classification etc... 
+ n_d : int + Dimension of the prediction layer (usually between 4 and 64) + n_a : int + Dimension of the attention layer (usually between 4 and 64) + n_steps : int + Number of successive steps in the network (usually between 3 and 10) + gamma : float + Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) + n_independent : int + Number of independent GLU layer in each GLU block (default 2) + n_shared : int + Number of independent GLU layer in each GLU block (default 2) + epsilon : float + Avoid log(0), this should be kept very low + virtual_batch_size : int + Batch size for Ghost Batch Normalization + momentum : float + Float value between 0 and 1 which will be used for momentum in all batch norm + mask_type : str + Either "sparsemax" or "entmax" : this is the masking function to use + group_attention_matrix : torch matrix + Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j + """ + super(TabNetEncoder, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.is_multi_task = isinstance(output_dim, list) + self.n_d = n_d + self.n_a = n_a + self.n_steps = n_steps + self.gamma = gamma + self.epsilon = epsilon + self.n_independent = n_independent + self.n_shared = n_shared + self.virtual_batch_size = virtual_batch_size + self.mask_type = mask_type + self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01) + self.group_attention_matrix = group_attention_matrix + + if self.group_attention_matrix is None: + # no groups + self.group_attention_matrix = torch.eye(self.input_dim) + self.attention_dim = self.input_dim + else: + self.attention_dim = self.group_attention_matrix.shape[0] + + if self.n_shared > 0: + shared_feat_transform = torch.nn.ModuleList() + for i in range(self.n_shared): + if i == 0: + shared_feat_transform.append( + Linear(self.input_dim, 2 * (n_d + n_a), bias=False) + ) + else: + shared_feat_transform.append( + Linear(n_d + n_a, 2 * (n_d + n_a), bias=False) + ) + + 
else: + shared_feat_transform = None + + self.initial_splitter = FeatTransformer( + self.input_dim, + n_d + n_a, + shared_feat_transform, + n_glu_independent=self.n_independent, + virtual_batch_size=self.virtual_batch_size, + momentum=momentum, + ) + + self.feat_transformers = torch.nn.ModuleList() + self.att_transformers = torch.nn.ModuleList() + + for step in range(n_steps): + transformer = FeatTransformer( + self.input_dim, + n_d + n_a, + shared_feat_transform, + n_glu_independent=self.n_independent, + virtual_batch_size=self.virtual_batch_size, + momentum=momentum, + ) + attention = AttentiveTransformer( + n_a, + self.attention_dim, + group_matrix=group_attention_matrix, + virtual_batch_size=self.virtual_batch_size, + momentum=momentum, + mask_type=self.mask_type, + ) + self.feat_transformers.append(transformer) + self.att_transformers.append(attention) + + def forward(self, x, prior=None): + x = self.initial_bn(x) + + bs = x.shape[0] # batch size + if prior is None: + prior = torch.ones((bs, self.attention_dim)).to(x.device) + + M_loss = 0 + att = self.initial_splitter(x)[:, self.n_d :] + steps_output = [] + for step in range(self.n_steps): + M = self.att_transformers[step](prior, att) + M_loss += torch.mean( + torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1) + ) + # update prior + prior = torch.mul(self.gamma - M, prior) + # output + M_feature_level = torch.matmul(M, self.group_attention_matrix) + masked_x = torch.mul(M_feature_level, x) + out = self.feat_transformers[step](masked_x) + d = ReLU()(out[:, : self.n_d]) + steps_output.append(d) + # update attention + att = out[:, self.n_d :] + + M_loss /= self.n_steps + return steps_output, M_loss + + def forward_masks(self, x): + x = self.initial_bn(x) + bs = x.shape[0] # batch size + prior = torch.ones((bs, self.attention_dim)).to(x.device) + M_explain = torch.zeros(x.shape).to(x.device) + att = self.initial_splitter(x)[:, self.n_d :] + masks = {} + + for step in range(self.n_steps): + M = 
self.att_transformers[step](prior, att) + M_feature_level = torch.matmul(M, self.group_attention_matrix) + masks[step] = M_feature_level + # update prior + prior = torch.mul(self.gamma - M, prior) + # output + masked_x = torch.mul(M_feature_level, x) + out = self.feat_transformers[step](masked_x) + d = ReLU()(out[:, : self.n_d]) + # explain + step_importance = torch.sum(d, dim=1) + M_explain += torch.mul(M_feature_level, step_importance.unsqueeze(dim=1)) + # update attention + att = out[:, self.n_d :] + + return M_explain, masks + + +class TabNetDecoder(torch.nn.Module): + def __init__( + self, + input_dim, + n_d=8, + n_steps=3, + n_independent=1, + n_shared=1, + virtual_batch_size=128, + momentum=0.02, + ): + """ + Defines main part of the TabNet network without the embedding layers. + + Parameters + ---------- + input_dim : int + Number of features + output_dim : int or list of int for multi task classification + Dimension of network output + examples : one for regression, 2 for binary classification etc... 
+ n_d : int + Dimension of the prediction layer (usually between 4 and 64) + n_steps : int + Number of successive steps in the network (usually between 3 and 10) + gamma : float + Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) + n_independent : int + Number of independent GLU layer in each GLU block (default 1) + n_shared : int + Number of independent GLU layer in each GLU block (default 1) + virtual_batch_size : int + Batch size for Ghost Batch Normalization + momentum : float + Float value between 0 and 1 which will be used for momentum in all batch norm + """ + super(TabNetDecoder, self).__init__() + self.input_dim = input_dim + self.n_d = n_d + self.n_steps = n_steps + self.n_independent = n_independent + self.n_shared = n_shared + self.virtual_batch_size = virtual_batch_size + + self.feat_transformers = torch.nn.ModuleList() + + if self.n_shared > 0: + shared_feat_transform = torch.nn.ModuleList() + for i in range(self.n_shared): + shared_feat_transform.append(Linear(n_d, 2 * n_d, bias=False)) + else: + shared_feat_transform = None + + for step in range(n_steps): + transformer = FeatTransformer( + n_d, + n_d, + shared_feat_transform, + n_glu_independent=self.n_independent, + virtual_batch_size=self.virtual_batch_size, + momentum=momentum, + ) + self.feat_transformers.append(transformer) + + self.reconstruction_layer = Linear(n_d, self.input_dim, bias=False) + initialize_non_glu(self.reconstruction_layer, n_d, self.input_dim) + + def forward(self, steps_output): + res = 0 + for step_nb, step_output in enumerate(steps_output): + x = self.feat_transformers[step_nb](step_output) + res = torch.add(res, x) + res = self.reconstruction_layer(res) + return res + + +class TabNetPretraining(torch.nn.Module): + def __init__( + self, + input_dim, + pretraining_ratio=0.2, + n_d=8, + n_a=8, + n_steps=3, + gamma=1.3, + cat_idxs=[], + cat_dims=[], + cat_emb_dim=1, + n_independent=2, + n_shared=2, + epsilon=1e-15, + virtual_batch_size=128, + 
momentum=0.02, + mask_type="sparsemax", + n_shared_decoder=1, + n_indep_decoder=1, + group_attention_matrix=None, + ): + super(TabNetPretraining, self).__init__() + + self.cat_idxs = cat_idxs or [] + self.cat_dims = cat_dims or [] + self.cat_emb_dim = cat_emb_dim + + self.input_dim = input_dim + self.n_d = n_d + self.n_a = n_a + self.n_steps = n_steps + self.gamma = gamma + self.epsilon = epsilon + self.n_independent = n_independent + self.n_shared = n_shared + self.mask_type = mask_type + self.pretraining_ratio = pretraining_ratio + self.n_shared_decoder = n_shared_decoder + self.n_indep_decoder = n_indep_decoder + + if self.n_steps <= 0: + raise ValueError("n_steps should be a positive integer.") + if self.n_independent == 0 and self.n_shared == 0: + raise ValueError("n_shared and n_independent can't be both zero.") + + self.virtual_batch_size = virtual_batch_size + self.embedder = EmbeddingGenerator(input_dim, + cat_dims, + cat_idxs, + cat_emb_dim, + group_attention_matrix) + self.post_embed_dim = self.embedder.post_embed_dim + + self.masker = RandomObfuscator(self.pretraining_ratio, + group_matrix=self.embedder.embedding_group_matrix) + self.encoder = TabNetEncoder( + input_dim=self.post_embed_dim, + output_dim=self.post_embed_dim, + n_d=n_d, + n_a=n_a, + n_steps=n_steps, + gamma=gamma, + n_independent=n_independent, + n_shared=n_shared, + epsilon=epsilon, + virtual_batch_size=virtual_batch_size, + momentum=momentum, + mask_type=mask_type, + group_attention_matrix=self.embedder.embedding_group_matrix, + ) + self.decoder = TabNetDecoder( + self.post_embed_dim, + n_d=n_d, + n_steps=n_steps, + n_independent=self.n_indep_decoder, + n_shared=self.n_shared_decoder, + virtual_batch_size=virtual_batch_size, + momentum=momentum, + ) + + def forward(self, x): + """ + Returns: res, embedded_x, obf_vars + res : output of reconstruction + embedded_x : embedded input + obf_vars : which variable where obfuscated + """ + embedded_x = self.embedder(x) + if self.training: + 
masked_x, obfuscated_groups, obfuscated_vars = self.masker(embedded_x) + # set prior of encoder with obfuscated groups + prior = 1 - obfuscated_groups + steps_out, _ = self.encoder(masked_x, prior=prior) + res = self.decoder(steps_out) + return res, embedded_x, obfuscated_vars + else: + steps_out, _ = self.encoder(embedded_x) + res = self.decoder(steps_out) + return res, embedded_x, torch.ones(embedded_x.shape).to(x.device) + + def forward_masks(self, x): + embedded_x = self.embedder(x) + return self.encoder.forward_masks(embedded_x) + + +class TabNetNoEmbeddings(torch.nn.Module): + def __init__( + self, + input_dim, + output_dim, + n_d=8, + n_a=8, + n_steps=3, + gamma=1.3, + n_independent=2, + n_shared=2, + epsilon=1e-15, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + group_attention_matrix=None, + ): + """ + Defines main part of the TabNet network without the embedding layers. + + Parameters + ---------- + input_dim : int + Number of features + output_dim : int or list of int for multi task classification + Dimension of network output + examples : one for regression, 2 for binary classification etc... 
+ n_d : int + Dimension of the prediction layer (usually between 4 and 64) + n_a : int + Dimension of the attention layer (usually between 4 and 64) + n_steps : int + Number of successive steps in the network (usually between 3 and 10) + gamma : float + Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) + n_independent : int + Number of independent GLU layer in each GLU block (default 2) + n_shared : int + Number of independent GLU layer in each GLU block (default 2) + epsilon : float + Avoid log(0), this should be kept very low + virtual_batch_size : int + Batch size for Ghost Batch Normalization + momentum : float + Float value between 0 and 1 which will be used for momentum in all batch norm + mask_type : str + Either "sparsemax" or "entmax" : this is the masking function to use + group_attention_matrix : torch matrix + Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j + """ + super(TabNetNoEmbeddings, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.is_multi_task = isinstance(output_dim, list) + self.n_d = n_d + self.n_a = n_a + self.n_steps = n_steps + self.gamma = gamma + self.epsilon = epsilon + self.n_independent = n_independent + self.n_shared = n_shared + self.virtual_batch_size = virtual_batch_size + self.mask_type = mask_type + self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01) + + self.encoder = TabNetEncoder( + input_dim=input_dim, + output_dim=output_dim, + n_d=n_d, + n_a=n_a, + n_steps=n_steps, + gamma=gamma, + n_independent=n_independent, + n_shared=n_shared, + epsilon=epsilon, + virtual_batch_size=virtual_batch_size, + momentum=momentum, + mask_type=mask_type, + group_attention_matrix=group_attention_matrix + ) + + if self.is_multi_task: + self.multi_task_mappings = torch.nn.ModuleList() + for task_dim in output_dim: + task_mapping = Linear(n_d, task_dim, bias=False) + initialize_non_glu(task_mapping, n_d, task_dim) + 
self.multi_task_mappings.append(task_mapping) + else: + self.final_mapping = Linear(n_d, output_dim, bias=False) + initialize_non_glu(self.final_mapping, n_d, output_dim) + + def forward(self, x): + res = 0 + steps_output, M_loss = self.encoder(x) + res = torch.sum(torch.stack(steps_output, dim=0), dim=0) + + if self.is_multi_task: + # Result will be in list format + out = [] + for task_mapping in self.multi_task_mappings: + out.append(task_mapping(res)) + else: + out = self.final_mapping(res) + return out, M_loss + + def forward_masks(self, x): + return self.encoder.forward_masks(x) + + +class TabNet(torch.nn.Module): + def __init__( + self, + input_dim, + output_dim, + n_d=8, + n_a=8, + n_steps=3, + gamma=1.3, + cat_idxs=[], + cat_dims=[], + cat_emb_dim=1, + n_independent=2, + n_shared=2, + epsilon=1e-15, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + group_attention_matrix=[], + ): + """ + Defines TabNet network + + Parameters + ---------- + input_dim : int + Initial number of features + output_dim : int + Dimension of network output + examples : one for regression, 2 for binary classification etc... 
+ n_d : int + Dimension of the prediction layer (usually between 4 and 64) + n_a : int + Dimension of the attention layer (usually between 4 and 64) + n_steps : int + Number of successive steps in the network (usually between 3 and 10) + gamma : float + Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) + cat_idxs : list of int + Index of each categorical column in the dataset + cat_dims : list of int + Number of categories in each categorical column + cat_emb_dim : int or list of int + Size of the embedding of categorical features + if int, all categorical features will have same embedding size + if list of int, every corresponding feature will have specific size + n_independent : int + Number of independent GLU layer in each GLU block (default 2) + n_shared : int + Number of independent GLU layer in each GLU block (default 2) + epsilon : float + Avoid log(0), this should be kept very low + virtual_batch_size : int + Batch size for Ghost Batch Normalization + momentum : float + Float value between 0 and 1 which will be used for momentum in all batch norm + mask_type : str + Either "sparsemax" or "entmax" : this is the masking function to use + group_attention_matrix : torch matrix + Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j + """ + super(TabNet, self).__init__() + self.cat_idxs = cat_idxs or [] + self.cat_dims = cat_dims or [] + self.cat_emb_dim = cat_emb_dim + + self.input_dim = input_dim + self.output_dim = output_dim + self.n_d = n_d + self.n_a = n_a + self.n_steps = n_steps + self.gamma = gamma + self.epsilon = epsilon + self.n_independent = n_independent + self.n_shared = n_shared + self.mask_type = mask_type + + if self.n_steps <= 0: + raise ValueError("n_steps should be a positive integer.") + if self.n_independent == 0 and self.n_shared == 0: + raise ValueError("n_shared and n_independent can't be both zero.") + + self.virtual_batch_size = virtual_batch_size + self.embedder = 
EmbeddingGenerator(input_dim, + cat_dims, + cat_idxs, + cat_emb_dim, + group_attention_matrix) + self.post_embed_dim = self.embedder.post_embed_dim + + self.tabnet = TabNetNoEmbeddings( + self.post_embed_dim, + output_dim, + n_d, + n_a, + n_steps, + gamma, + n_independent, + n_shared, + epsilon, + virtual_batch_size, + momentum, + mask_type, + self.embedder.embedding_group_matrix + ) + + def forward(self, x): + x = self.embedder(x) + return self.tabnet(x) + + def forward_masks(self, x): + x = self.embedder(x) + return self.tabnet.forward_masks(x) + + +class AttentiveTransformer(torch.nn.Module): + def __init__( + self, + input_dim, + group_dim, + group_matrix, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + ): + """ + Initialize an attention transformer. + + Parameters + ---------- + input_dim : int + Input size + group_dim : int + Number of groups for features + virtual_batch_size : int + Batch size for Ghost Batch Normalization + momentum : float + Float value between 0 and 1 which will be used for momentum in batch norm + mask_type : str + Either "sparsemax" or "entmax" : this is the masking function to use + """ + super(AttentiveTransformer, self).__init__() + self.fc = Linear(input_dim, group_dim, bias=False) + initialize_non_glu(self.fc, input_dim, group_dim) + self.bn = GBN( + group_dim, virtual_batch_size=virtual_batch_size, momentum=momentum + ) + + if mask_type == "sparsemax": + # Sparsemax + self.selector = sparsemax.Sparsemax(dim=-1) + elif mask_type == "entmax": + # Entmax + self.selector = sparsemax.Entmax15(dim=-1) + else: + raise NotImplementedError( + "Please choose either sparsemax" + "or entmax as masktype" + ) + + def forward(self, priors, processed_feat): + x = self.fc(processed_feat) + x = self.bn(x) + x = torch.mul(x, priors) + x = self.selector(x) + return x + + +class FeatTransformer(torch.nn.Module): + def __init__( + self, + input_dim, + output_dim, + shared_layers, + n_glu_independent, + virtual_batch_size=128, + 
momentum=0.02, + ): + super(FeatTransformer, self).__init__() + """ + Initialize a feature transformer. + + Parameters + ---------- + input_dim : int + Input size + output_dim : int + Output_size + shared_layers : torch.nn.ModuleList + The shared block that should be common to every step + n_glu_independent : int + Number of independent GLU layers + virtual_batch_size : int + Batch size for Ghost Batch Normalization within GLU block(s) + momentum : float + Float value between 0 and 1 which will be used for momentum in batch norm + """ + + params = { + "n_glu": n_glu_independent, + "virtual_batch_size": virtual_batch_size, + "momentum": momentum, + } + + if shared_layers is None: + # no shared layers + self.shared = torch.nn.Identity() + is_first = True + else: + self.shared = GLU_Block( + input_dim, + output_dim, + first=True, + shared_layers=shared_layers, + n_glu=len(shared_layers), + virtual_batch_size=virtual_batch_size, + momentum=momentum, + ) + is_first = False + + if n_glu_independent == 0: + # no independent layers + self.specifics = torch.nn.Identity() + else: + spec_input_dim = input_dim if is_first else output_dim + self.specifics = GLU_Block( + spec_input_dim, output_dim, first=is_first, **params + ) + + def forward(self, x): + x = self.shared(x) + x = self.specifics(x) + return x + + +class GLU_Block(torch.nn.Module): + """ + Independent GLU block, specific to each step + """ + + def __init__( + self, + input_dim, + output_dim, + n_glu=2, + first=False, + shared_layers=None, + virtual_batch_size=128, + momentum=0.02, + ): + super(GLU_Block, self).__init__() + self.first = first + self.shared_layers = shared_layers + self.n_glu = n_glu + self.glu_layers = torch.nn.ModuleList() + + params = {"virtual_batch_size": virtual_batch_size, "momentum": momentum} + + fc = shared_layers[0] if shared_layers else None + self.glu_layers.append(GLU_Layer(input_dim, output_dim, fc=fc, **params)) + for glu_id in range(1, self.n_glu): + fc = shared_layers[glu_id] if 
shared_layers else None + self.glu_layers.append(GLU_Layer(output_dim, output_dim, fc=fc, **params)) + + def forward(self, x): + scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device)) + if self.first: # the first layer of the block has no scale multiplication + x = self.glu_layers[0](x) + layers_left = range(1, self.n_glu) + else: + layers_left = range(self.n_glu) + + for glu_id in layers_left: + x = torch.add(x, self.glu_layers[glu_id](x)) + x = x * scale + return x + + +class GLU_Layer(torch.nn.Module): + def __init__( + self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02 + ): + super(GLU_Layer, self).__init__() + + self.output_dim = output_dim + if fc: + self.fc = fc + else: + self.fc = Linear(input_dim, 2 * output_dim, bias=False) + initialize_glu(self.fc, input_dim, 2 * output_dim) + + self.bn = GBN( + 2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum + ) + + def forward(self, x): + x = self.fc(x) + x = self.bn(x) + out = torch.mul(x[:, : self.output_dim], torch.sigmoid(x[:, self.output_dim :])) + return out + + +class EmbeddingGenerator(torch.nn.Module): + """ + Classical embeddings generator + """ + + def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dims, group_matrix): + """This is an embedding module for an entire set of features + + Parameters + ---------- + input_dim : int + Number of features coming as input (number of columns) + cat_dims : list of int + Number of modalities for each categorial features + If the list is empty, no embeddings will be done + cat_idxs : list of int + Positional index for each categorical features in inputs + cat_emb_dim : list of int + Embedding dimension for each categorical features + If int, the same embedding dimension will be used for all categorical features + group_matrix : torch matrix + Original group matrix before embeddings + """ + super(EmbeddingGenerator, self).__init__() + + if cat_dims == [] and cat_idxs == []: + self.skip_embedding = True + 
self.post_embed_dim = input_dim + self.embedding_group_matrix = group_matrix.to(group_matrix.device) + return + else: + self.skip_embedding = False + + self.post_embed_dim = int(input_dim + np.sum(cat_emb_dims) - len(cat_emb_dims)) + + self.embeddings = torch.nn.ModuleList() + + for cat_dim, emb_dim in zip(cat_dims, cat_emb_dims): + self.embeddings.append(torch.nn.Embedding(cat_dim, emb_dim)) + + # record continuous indices + self.continuous_idx = torch.ones(input_dim, dtype=torch.bool) + self.continuous_idx[cat_idxs] = 0 + + # update group matrix + n_groups = group_matrix.shape[0] + self.embedding_group_matrix = torch.empty((n_groups, self.post_embed_dim), + device=group_matrix.device) + for group_idx in range(n_groups): + post_emb_idx = 0 + cat_feat_counter = 0 + for init_feat_idx in range(input_dim): + if self.continuous_idx[init_feat_idx] == 1: + # this means that no embedding is applied to this column + self.embedding_group_matrix[group_idx, post_emb_idx] = group_matrix[group_idx, init_feat_idx] # noqa + post_emb_idx += 1 + else: + # this is a categorical feature which creates multiple embeddings + n_embeddings = cat_emb_dims[cat_feat_counter] + self.embedding_group_matrix[group_idx, post_emb_idx:post_emb_idx+n_embeddings] = group_matrix[group_idx, init_feat_idx] / n_embeddings # noqa + post_emb_idx += n_embeddings + cat_feat_counter += 1 + + def forward(self, x): + """ + Apply embeddings to inputs + Inputs should be (batch_size, input_dim) + Outputs will be of size (batch_size, self.post_embed_dim) + """ + if self.skip_embedding: + # no embeddings required + return x + + cols = [] + cat_feat_counter = 0 + for feat_init_idx, is_continuous in enumerate(self.continuous_idx): + # Enumerate through continuous idx boolean mask to apply embeddings + if is_continuous: + cols.append(x[:, feat_init_idx].float().view(-1, 1)) + else: + cols.append( + self.embeddings[cat_feat_counter](x[:, feat_init_idx].long()) + ) + cat_feat_counter += 1 + # concat + post_embeddings = 
torch.cat(cols, dim=1) + return post_embeddings + + +class RandomObfuscator(torch.nn.Module): + """ + Create and applies obfuscation masks. + The obfuscation is done at group level to match attention. + """ + + def __init__(self, pretraining_ratio, group_matrix): + """ + This create random obfuscation for self suppervised pretraining + Parameters + ---------- + pretraining_ratio : float + Ratio of feature to randomly discard for reconstruction + + """ + super(RandomObfuscator, self).__init__() + self.pretraining_ratio = pretraining_ratio + # group matrix is set to boolean here to pass all posssible information + self.group_matrix = (group_matrix > 0) + 0. + self.num_groups = group_matrix.shape[0] + + def forward(self, x): + """ + Generate random obfuscation mask. + + Returns + ------- + masked input and obfuscated variables. + """ + bs = x.shape[0] + + obfuscated_groups = torch.bernoulli( + self.pretraining_ratio * torch.ones((bs, self.num_groups), device=x.device) + ) + obfuscated_vars = torch.matmul(obfuscated_groups, self.group_matrix) + masked_input = torch.mul(1 - obfuscated_vars, x) + return masked_input, obfuscated_groups, obfuscated_vars diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py new file mode 100644 index 00000000..fda3bfd4 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py @@ -0,0 +1,552 @@ +from torch.utils.data import Dataset +from torch.utils.data import DataLoader, WeightedRandomSampler +import torch +import numpy as np +import scipy +import json +from sklearn.utils import check_array +import pandas as pd +import warnings + + +class TorchDataset(Dataset): + """ + Format for numpy array + + Parameters + ---------- + X : 2D array + The input matrix + y : 2D array + The one-hot encoded target + """ + + def __init__(self, x, y): + self.x = x + self.y = y + + def __len__(self): + return len(self.x) + + def __getitem__(self, index): + x, y = 
self.x[index], self.y[index] + return x, y + + +class SparseTorchDataset(Dataset): + """ + Format for csr_matrix + + Parameters + ---------- + X : CSR matrix + The input matrix + y : 2D array + The one-hot encoded target + """ + + def __init__(self, x, y): + self.x = x + self.y = y + + def __len__(self): + return self.x.shape[0] + + def __getitem__(self, index): + x = torch.from_numpy(self.x[index].toarray()[0]).float() + y = self.y[index] + return x, y + + +class PredictDataset(Dataset): + """ + Format for numpy array + + Parameters + ---------- + X : 2D array + The input matrix + """ + + def __init__(self, x): + self.x = x + + def __len__(self): + return len(self.x) + + def __getitem__(self, index): + x = self.x[index] + return x + + +class SparsePredictDataset(Dataset): + """ + Format for csr_matrix + + Parameters + ---------- + X : CSR matrix + The input matrix + """ + + def __init__(self, x): + self.x = x + + def __len__(self): + return self.x.shape[0] + + def __getitem__(self, index): + x = torch.from_numpy(self.x[index].toarray()[0]).float() + return x + + +def create_sampler(weights, y_train): + """ + This creates a sampler from the given weights + + Parameters + ---------- + weights : either 0, 1, dict or iterable + if 0 (default) : no weights will be applied + if 1 : classification only, will balanced class with inverse frequency + if dict : keys are corresponding class values are sample weights + if iterable : list or np array must be of length equal to nb elements + in the training set + y_train : np.array + Training targets + """ + if isinstance(weights, int): + if weights == 0: + need_shuffle = True + sampler = None + elif weights == 1: + need_shuffle = False + class_sample_count = np.array( + [len(np.where(y_train == t)[0]) for t in np.unique(y_train)] + ) + + weights = 1.0 / class_sample_count + + samples_weight = np.array([weights[t] for t in y_train]) + + samples_weight = torch.from_numpy(samples_weight) + samples_weight = samples_weight.double() 
+ sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) + else: + raise ValueError("Weights should be either 0, 1, dictionnary or list.") + elif isinstance(weights, dict): + # custom weights per class + need_shuffle = False + samples_weight = np.array([weights[t] for t in y_train]) + sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) + else: + # custom weights + if len(weights) != len(y_train): + raise ValueError("Custom weights should match number of train samples.") + need_shuffle = False + samples_weight = np.array(weights) + sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) + return need_shuffle, sampler + + +def create_dataloaders( + X_train, y_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory +): + """ + Create dataloaders with or without subsampling depending on weights and balanced. + + Parameters + ---------- + X_train : np.ndarray + Training data + y_train : np.array + Mapped Training targets + eval_set : list of tuple + List of eval tuple set (X, y) + weights : either 0, 1, dict or iterable + if 0 (default) : no weights will be applied + if 1 : classification only, will balanced class with inverse frequency + if dict : keys are corresponding class values are sample weights + if iterable : list or np array must be of length equal to nb elements + in the training set + batch_size : int + how many samples per batch to load + num_workers : int + how many subprocesses to use for data loading. 0 means that the data + will be loaded in the main process + drop_last : bool + set to True to drop the last incomplete batch, if the dataset size is not + divisible by the batch size. 
If False and the size of dataset is not + divisible by the batch size, then the last batch will be smaller + pin_memory : bool + Whether to pin GPU memory during training + + Returns + ------- + train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader + Training and validation dataloaders + """ + need_shuffle, sampler = create_sampler(weights, y_train) + + if scipy.sparse.issparse(X_train): + train_dataloader = DataLoader( + SparseTorchDataset(X_train.astype(np.float32), y_train), + batch_size=batch_size, + sampler=sampler, + shuffle=need_shuffle, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=pin_memory, + ) + else: + train_dataloader = DataLoader( + TorchDataset(X_train.astype(np.float32), y_train), + batch_size=batch_size, + sampler=sampler, + shuffle=need_shuffle, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=pin_memory, + ) + + valid_dataloaders = [] + for X, y in eval_set: + if scipy.sparse.issparse(X): + valid_dataloaders.append( + DataLoader( + SparseTorchDataset(X.astype(np.float32), y), + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + pin_memory=pin_memory, + ) + ) + else: + valid_dataloaders.append( + DataLoader( + TorchDataset(X.astype(np.float32), y), + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + pin_memory=pin_memory, + ) + ) + + return train_dataloader, valid_dataloaders + + +def create_explain_matrix(input_dim, cat_emb_dim, cat_idxs, post_embed_dim): + """ + This is a computational trick. + In order to rapidly sum importances from same embeddings + to the initial index. 
+ + Parameters + ---------- + input_dim : int + Initial input dim + cat_emb_dim : int or list of int + if int : size of embedding for all categorical feature + if list of int : size of embedding for each categorical feature + cat_idxs : list of int + Initial position of categorical features + post_embed_dim : int + Post embedding inputs dimension + + Returns + ------- + reducing_matrix : np.array + Matrix of dim (post_embed_dim, input_dim) to performe reduce + """ + + if isinstance(cat_emb_dim, int): + all_emb_impact = [cat_emb_dim - 1] * len(cat_idxs) + else: + all_emb_impact = [emb_dim - 1 for emb_dim in cat_emb_dim] + + acc_emb = 0 + nb_emb = 0 + indices_trick = [] + for i in range(input_dim): + if i not in cat_idxs: + indices_trick.append([i + acc_emb]) + else: + indices_trick.append( + range(i + acc_emb, i + acc_emb + all_emb_impact[nb_emb] + 1) + ) + acc_emb += all_emb_impact[nb_emb] + nb_emb += 1 + + reducing_matrix = np.zeros((post_embed_dim, input_dim)) + for i, cols in enumerate(indices_trick): + reducing_matrix[cols, i] = 1 + + return scipy.sparse.csc_matrix(reducing_matrix) + + +def create_group_matrix(list_groups, input_dim): + """ + Create the group matrix corresponding to the given list_groups + + Parameters + ---------- + - list_groups : list of list of int + Each element is a list representing features in the same group. + One feature should appear in maximum one group. + Feature that don't get assigned a group will be in their own group of one feature. + - input_dim : number of feature in the initial dataset + + Returns + ------- + - group_matrix : torch matrix + A matrix of size (n_groups, input_dim) + where m_ij represents the importance of feature j in group i + The rows must some to 1 as each group is equally important a priori. 
+ + """ + check_list_groups(list_groups, input_dim) + + if len(list_groups) == 0: + group_matrix = torch.eye(input_dim) + return group_matrix + else: + n_groups = input_dim - int(np.sum([len(gp) - 1 for gp in list_groups])) + group_matrix = torch.zeros((n_groups, input_dim)) + + remaining_features = [feat_idx for feat_idx in range(input_dim)] + + current_group_idx = 0 + for group in list_groups: + group_size = len(group) + for elem_idx in group: + # add importrance of element in group matrix and corresponding group + group_matrix[current_group_idx, elem_idx] = 1 / group_size + # remove features from list of features + remaining_features.remove(elem_idx) + # move to next group + current_group_idx += 1 + # features not mentionned in list_groups get assigned their own group of singleton + for remaining_feat_idx in remaining_features: + group_matrix[current_group_idx, remaining_feat_idx] = 1 + current_group_idx += 1 + return group_matrix + + +def check_list_groups(list_groups, input_dim): + """ + Check that list groups: + - is a list of list + - does not contain twice the same feature in different groups + - does not contain unknown features (>= input_dim) + - does not contain empty groups + Parameters + ---------- + - list_groups : list of list of int + Each element is a list representing features in the same group. + One feature should appear in maximum one group. + Feature that don't get assign a group will be in their own group of one feature. + - input_dim : number of feature in the initial dataset + """ + assert isinstance(list_groups, list), "list_groups must be a list of list." + + if len(list_groups) == 0: + return + else: + for group_pos, group in enumerate(list_groups): + msg = f"Groups must be given as a list of list, but found {group} in position {group_pos}." 
# noqa + assert isinstance(group, list), msg + assert len(group) > 0, "Empty groups are forbidding please remove empty groups []" + + n_elements_in_groups = np.sum([len(group) for group in list_groups]) + flat_list = [] + for group in list_groups: + flat_list.extend(group) + unique_elements = np.unique(flat_list) + n_unique_elements_in_groups = len(unique_elements) + msg = f"One feature can only appear in one group, please check your grouped_features." + assert n_unique_elements_in_groups == n_elements_in_groups, msg + + highest_feat = np.max(unique_elements) + assert highest_feat < input_dim, f"Number of features is {input_dim} but one group contains {highest_feat}." # noqa + return + + +def filter_weights(weights): + """ + This function makes sure that weights are in correct format for + regression and multitask TabNet + + Parameters + ---------- + weights : int, dict or list + Initial weights parameters given by user + + Returns + ------- + None : This function will only throw an error if format is wrong + """ + err_msg = """Please provide a list or np.array of weights for """ + err_msg += """regression, multitask or pretraining: """ + if isinstance(weights, int): + if weights == 1: + raise ValueError(err_msg + "1 given.") + if isinstance(weights, dict): + raise ValueError(err_msg + "Dict given.") + return + + +def validate_eval_set(eval_set, eval_name, X_train, y_train): + """Check if the shapes of eval_set are compatible with (X_train, y_train). + + Parameters + ---------- + eval_set : list of tuple + List of eval tuple set (X, y). + The last one is used for early stopping + eval_name : list of str + List of eval set names. + X_train : np.ndarray + Train owned products + y_train : np.array + Train targeted products + + Returns + ------- + eval_names : list of str + Validated list of eval_names. + eval_set : list of tuple + Validated list of eval_set. 
+ + """ + eval_name = eval_name or [f"val_{i}" for i in range(len(eval_set))] + + assert len(eval_set) == len( + eval_name + ), "eval_set and eval_name have not the same length" + if len(eval_set) > 0: + assert all( + len(elem) == 2 for elem in eval_set + ), "Each tuple of eval_set need to have two elements" + for name, (X, y) in zip(eval_name, eval_set): + check_input(X) + msg = ( + f"Dimension mismatch between X_{name} " + + f"{X.shape} and X_train {X_train.shape}" + ) + assert len(X.shape) == len(X_train.shape), msg + + msg = ( + f"Dimension mismatch between y_{name} " + + f"{y.shape} and y_train {y_train.shape}" + ) + assert len(y.shape) == len(y_train.shape), msg + + msg = ( + f"Number of columns is different between X_{name} " + + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" + ) + assert X.shape[1] == X_train.shape[1], msg + + if len(y_train.shape) == 2: + msg = ( + f"Number of columns is different between y_{name} " + + f"({y.shape[1]}) and y_train ({y_train.shape[1]})" + ) + assert y.shape[1] == y_train.shape[1], msg + msg = ( + f"You need the same number of rows between X_{name} " + + f"({X.shape[0]}) and y_{name} ({y.shape[0]})" + ) + assert X.shape[0] == y.shape[0], msg + + return eval_name, eval_set + + +def define_device(device_name): + """ + Define the device to use during training and inference. 
+ If auto it will detect automatically whether to use cuda or cpu + + Parameters + ---------- + device_name : str + Either "auto", "cpu" or "cuda" + + Returns + ------- + str + Either "cpu" or "cuda" + """ + if device_name == "auto": + if torch.cuda.is_available(): + return "cuda" + else: + return "cpu" + elif device_name == "cuda" and not torch.cuda.is_available(): + return "cpu" + else: + return device_name + + +class ComplexEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, (np.generic, np.ndarray)): + return obj.tolist() + # Let the base class default method raise the TypeError + return json.JSONEncoder.default(self, obj) + + +def check_input(X): + """ + Raise a clear error if X is a pandas dataframe + and check array according to scikit rules + """ + if isinstance(X, (pd.DataFrame, pd.Series)): + err_message = "Pandas DataFrame are not supported: apply X.values when calling fit" + raise TypeError(err_message) + check_array(X, accept_sparse=True) + + +def check_warm_start(warm_start, from_unsupervised): + """ + Gives a warning about ambiguous usage of the two parameters. + """ + if warm_start and from_unsupervised is not None: + warn_msg = "warm_start=True and from_unsupervised != None: " + warn_msg = "warm_start will be ignore, training will start from unsupervised weights" + warnings.warn(warn_msg) + return + + +def check_embedding_parameters(cat_dims, cat_idxs, cat_emb_dim): + """ + Check parameters related to embeddings and rearrange them in a unique manner. + """ + if (cat_dims == []) ^ (cat_idxs == []): + if cat_dims == []: + msg = "If cat_idxs is non-empty, cat_dims must be defined as a list of same length." + else: + msg = "If cat_dims is non-empty, cat_idxs must be defined as a list of same length." + raise ValueError(msg) + elif len(cat_dims) != len(cat_idxs): + msg = "The lists cat_dims and cat_idxs must have the same length." 
+ raise ValueError(msg) + + if isinstance(cat_emb_dim, int): + cat_emb_dims = [cat_emb_dim] * len(cat_idxs) + else: + cat_emb_dims = cat_emb_dim + + # check that all embeddings are provided + if len(cat_emb_dims) != len(cat_dims): + msg = f"""cat_emb_dim and cat_dims must be lists of same length, got {len(cat_emb_dims)} + and {len(cat_dims)}""" + raise ValueError(msg) + + # Rearrange to get reproducible seeds with different ordering + if len(cat_idxs) > 0: + sorted_idxs = np.argsort(cat_idxs) + cat_dims = [cat_dims[i] for i in sorted_idxs] + cat_emb_dims = [cat_emb_dims[i] for i in sorted_idxs] + + return cat_dims, cat_idxs, cat_emb_dims From 984f4b08f7d7748c6764823dab29c2551da950f5 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 08:42:42 +0000 Subject: [PATCH 12/49] not done still --- .../torch_based/autoint/autoint_utils.py | 1 - .../pytorch_tabnet/augmentations.py | 22 +++++++--- .../torch_based/pytorch_tabnet/callbacks.py | 6 +-- .../torch_based/pytorch_tabnet/metrics.py | 14 ++---- .../pytorch_tabnet/multiclass_utils.py | 39 ++++------------- .../torch_based/pytorch_tabnet/multitask.py | 25 +++-------- .../torch_based/pytorch_tabnet/pretraining.py | 28 ++++-------- .../pytorch_tabnet/pretraining_utils.py | 15 ++----- .../torch_based/pytorch_tabnet/sparsemax.py | 4 +- .../torch_based/pytorch_tabnet/tab_model.py | 34 ++++++--------- .../torch_based/pytorch_tabnet/utils.py | 43 +++++-------------- 11 files changed, 72 insertions(+), 159 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py b/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py index c96b3241..ba047d52 100644 --- a/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py +++ b/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py @@ -13,7 +13,6 @@ UniformEmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "embedding_size", "output_size"]) - class LeakyGate(nn.Module): """LeakyGate from https://github.com/jrfiedler/xynn. 
diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py index 287fa365..b520c0b0 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py @@ -3,7 +3,7 @@ import numpy as np -class RegressionSMOTE(): +class RegressionSMOTE: """ Apply SMOTE @@ -11,6 +11,7 @@ class RegressionSMOTE(): The target will be averaged as well (this might work with binary classification and certain loss), following a beta distribution. """ + def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): "" self.seed = seed @@ -19,7 +20,7 @@ def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): self.alpha = alpha self.beta = beta self.p = p - if (p < 0.) or (p > 1.0): + if (p < 0.0) or (p > 1.0): raise ValueError("Value of p should be between 0. and 1.") def _set_seed(self): @@ -38,21 +39,26 @@ def __call__(self, X, y): index_permute = torch.randperm(batch_size, device=self.device) X[idx_to_change] = random_betas[idx_to_change, None] * X[idx_to_change] - X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view(X[idx_to_change].size()) # noqa + X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view( + X[idx_to_change].size() + ) # noqa y[idx_to_change] = random_betas[idx_to_change, None] * y[idx_to_change] - y[idx_to_change] += (1 - random_betas[idx_to_change, None]) * y[index_permute][idx_to_change].view(y[idx_to_change].size()) # noqa + y[idx_to_change] += (1 - random_betas[idx_to_change, None]) * y[index_permute][idx_to_change].view( + y[idx_to_change].size() + ) # noqa return X, y -class ClassificationSMOTE(): +class ClassificationSMOTE: """ Apply SMOTE for classification tasks. This will average a percentage p of the elements in the batch with other elements. 
The target will stay unchanged and keep the value of the most important row in the mix. """ + def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): "" self.seed = seed @@ -61,7 +67,7 @@ def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): self.alpha = alpha self.beta = beta self.p = p - if (p < 0.) or (p > 1.0): + if (p < 0.0) or (p > 1.0): raise ValueError("Value of p should be between 0. and 1.") def _set_seed(self): @@ -80,6 +86,8 @@ def __call__(self, X, y): index_permute = torch.randperm(batch_size, device=self.device) X[idx_to_change] = random_betas[idx_to_change, None] * X[idx_to_change] - X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view(X[idx_to_change].size()) # noqa + X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view( + X[idx_to_change].size() + ) # noqa return X, y diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py index cb031d54..5c266502 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py @@ -224,9 +224,9 @@ def on_epoch_end(self, epoch, logs=None): def on_batch_end(self, batch, logs=None): batch_size = logs["batch_size"] - self.epoch_loss = ( - self.samples_seen * self.epoch_loss + batch_size * logs["loss"] - ) / (self.samples_seen + batch_size) + self.epoch_loss = (self.samples_seen * self.epoch_loss + batch_size * logs["loss"]) / ( + self.samples_seen + batch_size + ) self.samples_seen += batch_size def __getitem__(self, name): diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py index e8ad8181..ae716f33 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py @@ -157,9 +157,7 @@ def 
__call__(self, y_true, y_pred): logs = {} for metric in self.metrics: if isinstance(y_pred, list): - res = np.mean( - [metric(y_true[:, i], y_pred[i]) for i in range(len(y_pred))] - ) + res = np.mean([metric(y_true[:, i], y_pred[i]) for i in range(len(y_pred))]) else: res = metric(y_true, y_pred) logs[self.prefix + metric._name] = res @@ -191,9 +189,7 @@ def get_metrics_by_names(cls, names): available_names = [metric()._name for metric in available_metrics] metrics = [] for name in names: - assert ( - name in available_names - ), f"{name} is not available, choose in {available_names}" + assert name in available_names, f"{name} is not available, choose in {available_names}" idx = available_names.index(name) metric = available_metrics[idx]() metrics.append(metric) @@ -463,11 +459,7 @@ def __call__(self, y_pred, embedded_x, obf_vars): float MSE of predictions vs targets. """ - return UnsupervisedLossNumpy( - y_pred, - embedded_x, - obf_vars - ) + return UnsupervisedLossNumpy(y_pred, embedded_x, obf_vars) class RMSE(Metric): diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py index 8dbf08c5..b6fa2ef3 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py @@ -32,12 +32,7 @@ def _assert_all_finite(X, allow_nan=False): pass elif is_float: msg_err = "Input contains {} or a value too large for {!r}." 
- if ( - allow_nan - and np.isinf(X).any() - or not allow_nan - and not np.isfinite(X).all() - ): + if allow_nan and np.isinf(X).any() or not allow_nan and not np.isfinite(X).all(): type_err = "infinity" if allow_nan else "NaN, infinity" raise ValueError(msg_err.format(type_err, X.dtype)) # for object dtype data, we only check for NaNs (GH-13254) @@ -183,17 +178,12 @@ def is_multilabel(y): return ( len(y.data) == 0 or np.unique(y.data).size == 1 - and ( - y.dtype.kind in "biu" - or _is_integral_float(np.unique(y.data)) # bool, int, uint - ) + and (y.dtype.kind in "biu" or _is_integral_float(np.unique(y.data))) # bool, int, uint ) else: labels = np.unique(y) - return len(labels) < 3 and ( - y.dtype.kind in "biu" or _is_integral_float(labels) # bool, int, uint - ) + return len(labels) < 3 and (y.dtype.kind in "biu" or _is_integral_float(labels)) # bool, int, uint def check_classification_targets(y): @@ -282,14 +272,10 @@ def type_of_target(y): >>> type_of_target(np.array([[0, 1], [1, 1]])) 'multilabel-indicator' """ - valid = ( - isinstance(y, (Sequence, spmatrix)) or hasattr(y, "__array__") - ) and not isinstance(y, str) + valid = (isinstance(y, (Sequence, spmatrix)) or hasattr(y, "__array__")) and not isinstance(y, str) if not valid: - raise ValueError( - "Expected array-like (array or non-string sequence), " "got %r" % y - ) + raise ValueError("Expected array-like (array or non-string sequence), " "got %r" % y) sparseseries = y.__class__.__name__ == "SparseSeries" if sparseseries: @@ -306,11 +292,7 @@ def type_of_target(y): # The old sequence of sequences format try: - if ( - not hasattr(y[0], "__array__") - and isinstance(y[0], Sequence) - and not isinstance(y[0], str) - ): + if not hasattr(y[0], "__array__") and isinstance(y[0], Sequence) and not isinstance(y[0], str): raise ValueError( "You appear to be using a legacy multi-label data" " representation. 
Sequence of sequences are no" @@ -348,9 +330,7 @@ def type_of_target(y): def check_unique_type(y): target_types = pd.Series(y).map(type).unique() if len(target_types) != 1: - raise TypeError( - f"Values on the target must have the same type. Target has types {target_types}" - ) + raise TypeError(f"Values on the target must have the same type. Target has types {target_types}") def infer_output_dim(y_train): @@ -408,10 +388,7 @@ def infer_multitask_output(y_train): """ if len(y_train.shape) < 2: - raise ValueError( - "y_train should be of shape (n_examples, n_tasks)" - + f"but got {y_train.shape}" - ) + raise ValueError("y_train should be of shape (n_examples, n_tasks)" + f"but got {y_train.shape}") nb_tasks = y_train.shape[1] tasks_dims = [] tasks_labels = [] diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py index da836203..309c0e39 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py @@ -11,9 +11,9 @@ class TabNetMultiTaskClassifier(TabModel): def __post_init__(self): super(TabNetMultiTaskClassifier, self).__post_init__() - self._task = 'classification' + self._task = "classification" self._default_loss = torch.nn.functional.cross_entropy - self._default_metric = 'logloss' + self._default_metric = "logloss" def prepare_target(self, y): y_mapped = y.copy() @@ -43,9 +43,7 @@ def compute_loss(self, y_pred, y_true): y_true = y_true.long() if isinstance(self.loss_fn, list): # if you specify a different loss for each task - for task_loss, task_output, task_id in zip( - self.loss_fn, y_pred, range(len(self.loss_fn)) - ): + for task_loss, task_output, task_id in zip(self.loss_fn, y_pred, range(len(self.loss_fn))): loss += task_loss(task_output, y_true[:, task_id]) else: # same loss function is applied to all tasks @@ -72,12 +70,10 @@ def update_fit_params(self, X_train, y_train, eval_set, weights): 
self.output_dim = output_dim self.classes_ = train_labels self.target_mapper = [ - {class_label: index for index, class_label in enumerate(classes)} - for classes in self.classes_ + {class_label: index for index, class_label in enumerate(classes)} for classes in self.classes_ ] self.preds_mapper = [ - {str(index): str(class_label) for index, class_label in enumerate(classes)} - for classes in self.classes_ + {str(index): str(class_label) for index, class_label in enumerate(classes)} for classes in self.classes_ ] self.updated_weights = weights filter_weights(self.updated_weights) @@ -116,11 +112,7 @@ def predict(self, X): data = data.to(self.device).float() output, _ = self.network(data) predictions = [ - torch.argmax(torch.nn.Softmax(dim=1)(task_output), dim=1) - .cpu() - .detach() - .numpy() - .reshape(-1) + torch.argmax(torch.nn.Softmax(dim=1)(task_output), dim=1).cpu().detach().numpy().reshape(-1) for task_output in output ] @@ -168,10 +160,7 @@ def predict_proba(self, X): for data in dataloader: data = data.to(self.device).float() output, _ = self.network(data) - predictions = [ - torch.nn.Softmax(dim=1)(task_output).cpu().detach().numpy() - for task_output in output - ] + predictions = [torch.nn.Softmax(dim=1)(task_output).cpu().detach().numpy() for task_output in output] for task_idx in range(len(self.output_dim)): results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] res = [np.vstack(task_res) for task_res in results.values()] diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py index 87de306d..9044d497 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py @@ -27,9 +27,9 @@ class TabNetPretrainer(TabModel): def __post_init__(self): super(TabNetPretrainer, self).__post_init__() - self._task = 'unsupervised' + self._task = "unsupervised" self._default_loss = 
UnsupervisedLoss - self._default_metric = 'unsup_loss_numpy' + self._default_metric = "unsup_loss_numpy" def prepare_target(self, y): return y @@ -61,7 +61,7 @@ def fit( drop_last=True, callbacks=None, pin_memory=True, - warm_start=False + warm_start=False, ): """Train a neural network stored in self.network Using train_dataloader for training data and @@ -130,9 +130,7 @@ def fit( # Validate and reformat eval set depending on training data eval_names = validate_eval_set(eval_set, eval_name, X_train) - train_dataloader, valid_dataloaders = self._construct_loaders( - X_train, eval_set - ) + train_dataloader, valid_dataloaders = self._construct_loaders(X_train, eval_set) if not hasattr(self, "network") or not warm_start: # model has never been fitted before of warm_start is False @@ -159,9 +157,7 @@ def fit( self._predict_epoch(eval_name, valid_dataloader) # Call method on_epoch_end for all callbacks - self._callback_container.on_epoch_end( - epoch_idx, logs=self.history.epoch_metrics - ) + self._callback_container.on_epoch_end(epoch_idx, logs=self.history.epoch_metrics) if self._stop_training: break @@ -172,7 +168,7 @@ def fit( def _set_network(self): """Setup the network and explain matrix.""" - if not hasattr(self, 'pretraining_ratio'): + if not hasattr(self, "pretraining_ratio"): self.pretraining_ratio = 0.5 torch.manual_seed(self.seed) @@ -227,9 +223,7 @@ def _set_metrics(self, eval_names): # Set metric container for each sets self._metric_container_dict = {} for name in eval_names: - self._metric_container_dict.update( - {name: UnsupMetricContainer(metrics, prefix=f"{name}_")} - ) + self._metric_container_dict.update({name: UnsupMetricContainer(metrics, prefix=f"{name}_")}) self._metrics = [] self._metrics_names = [] @@ -238,9 +232,7 @@ def _set_metrics(self, eval_names): self._metrics_names.extend(metric_container.names) # Early stopping metric is the last eval metric - self.early_stopping_metric = ( - self._metrics_names[-1] if len(self._metrics_names) > 0 
else None - ) + self.early_stopping_metric = self._metrics_names[-1] if len(self._metrics_names) > 0 else None def _construct_loaders(self, X_train, eval_set): """Generate dataloaders for unsupervised train and eval set. @@ -354,9 +346,7 @@ def _predict_epoch(self, name, loader): list_embedded_x.append(embedded_x.cpu().detach().numpy()) list_obfuscation.append(obf_vars.cpu().detach().numpy()) - output, embedded_x, obf_vars = self.stack_batches(list_output, - list_embedded_x, - list_obfuscation) + output, embedded_x, obf_vars = self.stack_batches(list_output, list_embedded_x, list_obfuscation) metrics_logs = self._metric_container_dict[name](output, embedded_x, obf_vars) self.network.train() diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py index 0874be95..d35e34f2 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py @@ -1,16 +1,9 @@ from torch.utils.data import DataLoader -from pytorch_tabnet.utils import ( - create_sampler, - SparsePredictDataset, - PredictDataset, - check_input -) +from pytorch_tabnet.utils import create_sampler, SparsePredictDataset, PredictDataset, check_input import scipy -def create_dataloaders( - X_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory -): +def create_dataloaders(X_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory): """ Create dataloaders with or without subsampling depending on weights and balanced. 
@@ -114,9 +107,7 @@ def validate_eval_set(eval_set, eval_name, X_train): """ eval_names = eval_name or [f"val_{i}" for i in range(len(eval_set))] - assert len(eval_set) == len( - eval_names - ), "eval_set and eval_name have not the same length" + assert len(eval_set) == len(eval_names), "eval_set and eval_name have not the same length" for set_nb, X in enumerate(eval_set): check_input(X) diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py index 9862efa4..53a71792 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py @@ -100,7 +100,6 @@ def _threshold_and_support(input, dim=-1): class Sparsemax(nn.Module): - def __init__(self, dim=-1): self.dim = dim super(Sparsemax, self).__init__() @@ -131,7 +130,7 @@ def forward(ctx, input, dim=-1): @staticmethod def backward(ctx, grad_output): - Y, = ctx.saved_tensors + (Y,) = ctx.saved_tensors gppr = Y.sqrt() # = 1 / g'' (Y) dX = grad_output * gppr q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) @@ -195,7 +194,6 @@ def _backward(output, grad_output): class Entmax15(nn.Module): - def __init__(self, dim=-1): self.dim = dim super(Entmax15, self).__init__() diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py index ff01991c..32115c8c 100755 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py @@ -11,9 +11,9 @@ class TabNetClassifier(TabModel): def __post_init__(self): super(TabNetClassifier, self).__post_init__() - self._task = 'classification' + self._task = "classification" self._default_loss = torch.nn.functional.cross_entropy - self._default_metric = 'accuracy' + self._default_metric = "accuracy" def weight_updater(self, weights): """ @@ -54,14 +54,10 @@ def update_fit_params( for X, y in eval_set: 
check_output_dim(train_labels, y) self.output_dim = output_dim - self._default_metric = ('auc' if self.output_dim == 2 else 'accuracy') + self._default_metric = "auc" if self.output_dim == 2 else "accuracy" self.classes_ = train_labels - self.target_mapper = { - class_label: index for index, class_label in enumerate(self.classes_) - } - self.preds_mapper = { - str(index): class_label for index, class_label in enumerate(self.classes_) - } + self.target_mapper = {class_label: index for index, class_label in enumerate(self.classes_)} + self.preds_mapper = {str(index): class_label for index, class_label in enumerate(self.classes_)} self.updated_weights = self.weight_updater(weights) def stack_batches(self, list_y_true, list_y_score): @@ -117,9 +113,9 @@ def predict_proba(self, X): class TabNetRegressor(TabModel): def __post_init__(self): super(TabNetRegressor, self).__post_init__() - self._task = 'regression' + self._task = "regression" self._default_loss = torch.nn.functional.mse_loss - self._default_metric = 'mse' + self._default_metric = "mse" def prepare_target(self, y): return y @@ -127,17 +123,13 @@ def prepare_target(self, y): def compute_loss(self, y_pred, y_true): return self.loss_fn(y_pred, y_true) - def update_fit_params( - self, - X_train, - y_train, - eval_set, - weights - ): + def update_fit_params(self, X_train, y_train, eval_set, weights): if len(y_train.shape) != 2: - msg = "Targets should be 2D : (n_samples, n_regression) " + \ - f"but y_train.shape={y_train.shape} given.\n" + \ - "Use reshape(-1, 1) for single regression." + msg = ( + "Targets should be 2D : (n_samples, n_regression) " + + f"but y_train.shape={y_train.shape} given.\n" + + "Use reshape(-1, 1) for single regression." 
+ ) raise ValueError(msg) self.output_dim = y_train.shape[1] self.preds_mapper = None diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py index fda3bfd4..52d15a72 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py @@ -121,9 +121,7 @@ def create_sampler(weights, y_train): sampler = None elif weights == 1: need_shuffle = False - class_sample_count = np.array( - [len(np.where(y_train == t)[0]) for t in np.unique(y_train)] - ) + class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)]) weights = 1.0 / class_sample_count @@ -149,9 +147,7 @@ def create_sampler(weights, y_train): return need_shuffle, sampler -def create_dataloaders( - X_train, y_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory -): +def create_dataloaders(X_train, y_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory): """ Create dataloaders with or without subsampling depending on weights and balanced. 
@@ -271,9 +267,7 @@ def create_explain_matrix(input_dim, cat_emb_dim, cat_idxs, post_embed_dim): if i not in cat_idxs: indices_trick.append([i + acc_emb]) else: - indices_trick.append( - range(i + acc_emb, i + acc_emb + all_emb_impact[nb_emb] + 1) - ) + indices_trick.append(range(i + acc_emb, i + acc_emb + all_emb_impact[nb_emb] + 1)) acc_emb += all_emb_impact[nb_emb] nb_emb += 1 @@ -420,43 +414,26 @@ def validate_eval_set(eval_set, eval_name, X_train, y_train): """ eval_name = eval_name or [f"val_{i}" for i in range(len(eval_set))] - assert len(eval_set) == len( - eval_name - ), "eval_set and eval_name have not the same length" + assert len(eval_set) == len(eval_name), "eval_set and eval_name have not the same length" if len(eval_set) > 0: - assert all( - len(elem) == 2 for elem in eval_set - ), "Each tuple of eval_set need to have two elements" + assert all(len(elem) == 2 for elem in eval_set), "Each tuple of eval_set need to have two elements" for name, (X, y) in zip(eval_name, eval_set): check_input(X) - msg = ( - f"Dimension mismatch between X_{name} " - + f"{X.shape} and X_train {X_train.shape}" - ) + msg = f"Dimension mismatch between X_{name} " + f"{X.shape} and X_train {X_train.shape}" assert len(X.shape) == len(X_train.shape), msg - msg = ( - f"Dimension mismatch between y_{name} " - + f"{y.shape} and y_train {y_train.shape}" - ) + msg = f"Dimension mismatch between y_{name} " + f"{y.shape} and y_train {y_train.shape}" assert len(y.shape) == len(y_train.shape), msg - msg = ( - f"Number of columns is different between X_{name} " - + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" - ) + msg = f"Number of columns is different between X_{name} " + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" assert X.shape[1] == X_train.shape[1], msg if len(y_train.shape) == 2: msg = ( - f"Number of columns is different between y_{name} " - + f"({y.shape[1]}) and y_train ({y_train.shape[1]})" + f"Number of columns is different between y_{name} " + f"({y.shape[1]}) 
and y_train ({y_train.shape[1]})" ) assert y.shape[1] == y_train.shape[1], msg - msg = ( - f"You need the same number of rows between X_{name} " - + f"({X.shape[0]}) and y_{name} ({y.shape[0]})" - ) + msg = f"You need the same number of rows between X_{name} " + f"({X.shape[0]}) and y_{name} ({y.shape[0]})" assert X.shape[0] == y.shape[0], msg return eval_name, eval_set From fbc5076f5aefe8b3d7a6ec0bed45b8afc570290d Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 08:43:24 +0000 Subject: [PATCH 13/49] not done still --- lightautoml/ml_algo/tabnet/utils.py | 44 ++++++++--------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/lightautoml/ml_algo/tabnet/utils.py b/lightautoml/ml_algo/tabnet/utils.py index 9dad6259..332f0d89 100644 --- a/lightautoml/ml_algo/tabnet/utils.py +++ b/lightautoml/ml_algo/tabnet/utils.py @@ -2,7 +2,7 @@ import torch import numpy as np import torch.nn as nn -from lightautoml.ml_algo.torch_based.node_nn_model import Entmax15, Sparsemax, sparsemax,entmax15 +from lightautoml.ml_algo.torch_based.node_nn_model import Entmax15, Sparsemax, sparsemax, entmax15 from lightautoml.ml_algo.torch_based.autoint.ghost_norm import GhostBatchNorm @@ -20,9 +20,6 @@ def initialize_glu(module, input_dim, output_dim): return - - - class TabNetEncoder(torch.nn.Module): def __init__( self, @@ -100,13 +97,9 @@ def __init__( shared_feat_transform = torch.nn.ModuleList() for i in range(self.n_shared): if i == 0: - shared_feat_transform.append( - nn.Linear(self.input_dim, 2 * (n_d + n_a), bias=False) - ) + shared_feat_transform.append(nn.Linear(self.input_dim, 2 * (n_d + n_a), bias=False)) else: - shared_feat_transform.append( - nn.Linear(n_d + n_a, 2 * (n_d + n_a), bias=False) - ) + shared_feat_transform.append(nn.Linear(n_d + n_a, 2 * (n_d + n_a), bias=False)) else: shared_feat_transform = None @@ -155,9 +148,7 @@ def forward(self, x, prior=None): steps_output = [] for step in range(self.n_steps): M = 
self.att_transformers[step](prior, att) - M_loss += torch.mean( - torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1) - ) + M_loss += torch.mean(torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1)) # update prior prior = torch.mul(self.gamma - M, prior) # output @@ -197,7 +188,6 @@ def forward_masks(self, x): att = out[:, self.n_d :] return M_explain, masks - class FeatTransformer(torch.nn.Module): @@ -257,15 +247,13 @@ def __init__( self.specifics = torch.nn.Identity() else: spec_input_dim = input_dim if is_first else output_dim - self.specifics = GLU_Block( - spec_input_dim, output_dim, first=is_first, **params - ) + self.specifics = GLU_Block(spec_input_dim, output_dim, first=is_first, **params) def forward(self, x): x = self.shared(x) x = self.specifics(x) return x - + class GLU_Block(torch.nn.Module): """ @@ -308,13 +296,10 @@ def forward(self, x): x = torch.add(x, self.glu_layers[glu_id](x)) x = x * scale return x - class GLU_Layer(torch.nn.Module): - def __init__( - self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02 - ): + def __init__(self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02): super(GLU_Layer, self).__init__() self.output_dim = output_dim @@ -324,16 +309,13 @@ def __init__( self.fc = nn.Linear(input_dim, 2 * output_dim, bias=False) initialize_glu(self.fc, input_dim, 2 * output_dim) - self.bn = GhostBatchNorm( - 2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum - ) + self.bn = GhostBatchNorm(2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum) def forward(self, x): x = self.fc(x) x = self.bn(x) out = torch.mul(x[:, : self.output_dim], torch.sigmoid(x[:, self.output_dim :])) return out - class AttentiveTransformer(torch.nn.Module): @@ -365,9 +347,7 @@ def __init__( super(AttentiveTransformer, self).__init__() self.fc = nn.Linear(input_dim, group_dim, bias=False) initialize_non_glu(self.fc, input_dim, group_dim) - self.bn = GhostBatchNorm( - 
group_dim, virtual_batch_size=virtual_batch_size, momentum=momentum - ) + self.bn = GhostBatchNorm(group_dim, virtual_batch_size=virtual_batch_size, momentum=momentum) if mask_type == "sparsemax": # Sparsemax @@ -376,13 +356,11 @@ def __init__( # Entmax self.selector = Entmax15() else: - raise NotImplementedError( - "Please choose either sparsemax" + "or entmax as masktype" - ) + raise NotImplementedError("Please choose either sparsemax" + "or entmax as masktype") def forward(self, priors, processed_feat): x = self.fc(processed_feat) x = self.bn(x) x = torch.mul(x, priors) x = self.selector(x) - return x \ No newline at end of file + return x From 3a62edbf1f8aa5b09ff8c127f00f3ab482f97353 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 08:46:31 +0000 Subject: [PATCH 14/49] -Lambda + MP --- .../ml_algo/torch_based/node_nn_model.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py index cdfedbea..d14ba601 100644 --- a/lightautoml/ml_algo/torch_based/node_nn_model.py +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -258,26 +258,29 @@ def _backward(output, grad_output): entmoid15 = Entmoid15.apply # noqa: E731 -class Lambda(nn.Module): - """Pytorch implementation of lambda. +class MeanPooling(nn.Module): + """Pytorch implementation of MeanPooling head. Args: - func : returned func + n_out: int, output dim. + dim: int: the dimension to be averaged. + """ - def __init__(self, func): + def __init__(self, n_out, dim=-1): super().__init__() - self.func = func + self.n_out = n_out + self.dim = dim - def forward(self, *args, **kwargs): + def forward(self, x: torch.Tensor): """Forward-pass. 
# noqa: DAR101 Returns: - f(*args, **kwargs) + x[..., :self.n_out].mean(dim=self.dim) """ - return self.func(*args, **kwargs) + return x[..., :self.n_out].mean(dim=self.dim) class ModuleWithInit(nn.Module): From dc003fa67e66438cd70d40041915498cfb42ae8f Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 10:19:03 +0000 Subject: [PATCH 15/49] changed on comments --- lightautoml/ml_algo/torch_based/nn_models.py | 5 +- .../ml_algo/torch_based/node_nn_model.py | 95 ++++++++++++++++--- 2 files changed, 85 insertions(+), 15 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 119e0779..19ec5313 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -9,8 +9,7 @@ import torch import torch.nn as nn -from lightautoml.ml_algo.torch_based.node_nn_model import DenseODSTBlock -from lightautoml.ml_algo.torch_based.node_nn_model import Lambda +from lightautoml.ml_algo.torch_based.node_nn_model import DenseODSTBlock, MeanPooling class GaussianNoise(nn.Module): @@ -781,7 +780,7 @@ def __init__( self.features1.add_module("ODSTForestblock%d", block) self.features2 = nn.Sequential(OrderedDict([])) if use_original_head: - last_layer = Lambda(lambda x: x[..., :n_out].mean(dim=-2)) + last_layer = MeanPooling(n_out, dim=-2) self.features2.add_module("head", last_layer) else: if use_bn: diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py index d14ba601..77ecc076 100644 --- a/lightautoml/ml_algo/torch_based/node_nn_model.py +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -122,8 +122,41 @@ def _threshold_and_support(input, dim=-1): return tau, support_size -sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) # noqa: E731 -sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) # noqa: E731 +class Sparsemax(nn.Module): + """Py-Torch class for Sparsemax.""" + + 
def __init__(self): + super(Sparsemax, self).__init__() + + def forward(self, input, dim): + """Forward-pass. + + Args: + input (Tensor): input Tensor. + dim (int): dimension which will be aggregatedю + + Returns: + Entmax15(input,dim=dim) + """ + return SparsemaxFunction.apply(input, dim) + + +class Sparsemoid(nn.Module): + """Py-Torch class for Sparsemoid.""" + + def __init__(self): + super(Sparsemoid, self).__init__() + + def forward(self, input): + """Forward-pass. + + Args: + input (Tensor): input Tensor + + Returns: + Sparsemoid(input) + """ + return (0.5 * input + 0.5).clamp_(0, 1) class Entmax15Function(Function): @@ -204,8 +237,8 @@ def _threshold_and_support(input, dim=-1): return tau_star, support_size -class Entmoid15(Function): - """A highly optimized equivalent of labda x: Entmax15([x, 0]).""" +class Entmoid15Optimied(Function): + """A highly optimized equivalent of lambda x: Entmax15([x, 0]).""" @staticmethod def forward(ctx, input): @@ -218,7 +251,7 @@ def forward(ctx, input): Returns: output (Tensor): same shape as input """ - output = Entmoid15._forward(input) + output = Entmoid15Optimied._forward(input) ctx.save_for_backward(output) return output @@ -242,7 +275,7 @@ def backward(ctx, grad_output): Returns: grad output """ - return Entmoid15._backward(ctx.saved_tensors[0], grad_output) + return Entmoid15Optimied._backward(ctx.saved_tensors[0], grad_output) @staticmethod @script @@ -254,8 +287,41 @@ def _backward(output, grad_output): return grad_input -entmax15 = lambda input, dim=-1: Entmax15Function.apply(input, dim) # noqa: E731 -entmoid15 = Entmoid15.apply # noqa: E731 +class Entmax15(nn.Module): + """Py-Torch class for Entmax15.""" + + def __init__(self): + super(Entmax15, self).__init__() + + def forward(self, input, dim): + """Forward-pass. + + Args: + input (Tensor): input Tensor. 
+ dim (int): dimension which will be aggregatedю + + Returns: + Entmax15(input,dim=dim) + """ + return Entmax15Function.apply(input, dim) + + +class Entmoid15(nn.Module): + """Py-Torch class for Entmoid15.""" + + def __init__(self): + super(Entmoid15, self).__init__() + + def forward(self, input): + """Forward-pass. + + Args: + input (Tensor): input Tensor + + Returns: + Entmoid15(input) + """ + return Entmoid15Optimied.apply(input) class MeanPooling(nn.Module): @@ -264,7 +330,7 @@ class MeanPooling(nn.Module): Args: n_out: int, output dim. dim: int: the dimension to be averaged. - + """ def __init__(self, n_out, dim=-1): @@ -280,7 +346,7 @@ def forward(self, x: torch.Tensor): Returns: x[..., :self.n_out].mean(dim=self.dim) """ - return x[..., :self.n_out].mean(dim=self.dim) + return x[..., : self.n_out].mean(dim=self.dim) class ModuleWithInit(nn.Module): @@ -355,8 +421,8 @@ def __init__( depth=6, tree_dim=1, flatten_output=True, - choice_function=entmax15, - bin_function=entmoid15, + choice_function=Entmax15(), + bin_function=Entmoid15(), initialize_response_=nn.init.normal_, initialize_selection_logits_=nn.init.uniform_, threshold_init_beta=1.0, @@ -523,6 +589,11 @@ def forward(self, x): tail_features = min(self.max_features, layer_inp.shape[-1]) - initial_features if tail_features != 0: layer_inp = torch.cat([layer_inp[..., :initial_features], layer_inp[..., -tail_features:]], dim=-1) + """ + Originally it was: + if self.training and self.input_dropout: + layer_inp = F.dropout(layer_inp, self.input_dropout) + """ if self.input_dropout: layer_inp = F.dropout(layer_inp, self.input_dropout, self.training) h = layer(layer_inp) From 32bae0ae1b5abb786725fc910ec2377e51f17a9e Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 10:37:55 +0000 Subject: [PATCH 16/49] changes on comments --- lightautoml/ml_algo/torch_based/linear_model.py | 4 ++-- lightautoml/ml_algo/utils.py | 4 ++-- pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) 
diff --git a/lightautoml/ml_algo/torch_based/linear_model.py b/lightautoml/ml_algo/torch_based/linear_model.py index 6321caf5..3fcfc96e 100644 --- a/lightautoml/ml_algo/torch_based/linear_model.py +++ b/lightautoml/ml_algo/torch_based/linear_model.py @@ -16,7 +16,7 @@ from torch import optim from ...tasks.losses import TorchLossWrapper -from ..utils import MySoftmaxClip +from ..utils import SoftmaxClip logger = logging.getLogger(__name__) @@ -138,7 +138,7 @@ class CatMulticlass(CatLinear): def __init__(self, numeric_size: int, embed_sizes: Sequence[int] = (), output_size: int = 1): super().__init__(numeric_size, embed_sizes=embed_sizes, output_size=output_size) - self.final_act = MySoftmaxClip(dim=1) + self.final_act = SoftmaxClip(dim=1) class TorchBasedLinearEstimator: diff --git a/lightautoml/ml_algo/utils.py b/lightautoml/ml_algo/utils.py index 3cbbd5e1..f142f3f8 100644 --- a/lightautoml/ml_algo/utils.py +++ b/lightautoml/ml_algo/utils.py @@ -83,7 +83,7 @@ def tune_and_fit_predict( return ml_algo, preds -class MySoftmaxClip(nn.Module): +class SoftmaxClip(nn.Module): """Softmax with clip-norm. 
Args: @@ -92,7 +92,7 @@ class MySoftmaxClip(nn.Module): """ def __init__(self, dim: Optional[int] = None) -> None: - super(MySoftmaxClip, self).__init__() + super(SoftmaxClip, self).__init__() self.dim = dim self.smax = nn.Softmax(dim=dim) diff --git a/pyproject.toml b/pyproject.toml index b78a6714..352a0cad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ catboost = ">=0.26.1" optuna = "*" torch = [ {platform = "win32", python = "3.6.1", version = "1.7.0"}, - {version = "<=2.0.0"} + {version = ">=1.9.0, <=2.0.0"} ] dataclasses = {version = "0.6", python = "<3.7"} holidays = "*" From a58df59b5bc18b20c6696f0f54903078ca495c84 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 12:14:47 +0000 Subject: [PATCH 17/49] added changes on comments --- lightautoml/automl/presets/image_config.yml | 4 + lightautoml/automl/presets/tabular_config.yml | 6 +- lightautoml/automl/presets/text_config.yml | 4 + .../automl/presets/time_series_config.yml | 4 + lightautoml/ml_algo/dl_model.py | 86 +++++++++++++------ .../torch_based/autoint/autoint_utils.py | 47 ---------- lightautoml/text/embed.py | 79 +++++------------ lightautoml/text/nn_model.py | 12 +-- 8 files changed, 106 insertions(+), 136 deletions(-) diff --git a/lightautoml/automl/presets/image_config.yml b/lightautoml/automl/presets/image_config.yml index 01c04b30..2ba5b4c7 100755 --- a/lightautoml/automl/presets/image_config.yml +++ b/lightautoml/automl/presets/image_config.yml @@ -244,6 +244,10 @@ nn_params: model: denselight # embedding_size if needed embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network diff --git a/lightautoml/automl/presets/tabular_config.yml b/lightautoml/automl/presets/tabular_config.yml index 691609f4..bcf6abbd 100755 --- a/lightautoml/automl/presets/tabular_config.yml +++ 
b/lightautoml/automl/presets/tabular_config.yml @@ -130,6 +130,10 @@ nn_params: model: denselight # embedding_size if needed embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network @@ -156,7 +160,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 256, 128, 64] + hidden_size: [512, 256, 128] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/text_config.yml b/lightautoml/automl/presets/text_config.yml index 14d9c3f1..db82a874 100755 --- a/lightautoml/automl/presets/text_config.yml +++ b/lightautoml/automl/presets/text_config.yml @@ -122,6 +122,10 @@ linear_l2_params: nn_params: # embedding_size if needed embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # early stopping and scheduler use metric stop_by_metric: False random_state: 42 diff --git a/lightautoml/automl/presets/time_series_config.yml b/lightautoml/automl/presets/time_series_config.yml index 2e4cbdb7..8bb055ae 100644 --- a/lightautoml/automl/presets/time_series_config.yml +++ b/lightautoml/automl/presets/time_series_config.yml @@ -134,6 +134,10 @@ nn_params: model: denselight # embedding_size if needed embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 49be49f0..d7b1ed0c 100644 --- a/lightautoml/ml_algo/dl_model.py +++ 
b/lightautoml/ml_algo/dl_model.py @@ -44,7 +44,17 @@ from ..ml_algo.base import TabularDataset from ..ml_algo.base import TabularMLAlgo from ..pipelines.utils import get_columns_by_role -from ..text.embed import CatEmbedder, DefaultEmbedding, DenseEmbedding, LinearEmbedding, BasicEmbedding +from ..text.embed import ( + BasicCatEmbeddingFlat, + CatEmbedder, + DenseEmbedding, + DenseEmbeddingFlat, + LinearEmbedding, + LinearEmbeddingFlat, + WeightedCatEmbedding, + BasicCatEmbedding, + WeightedCatEmbeddingFlat, +) from ..text.embed import ContEmbedder from ..text.embed import TextBert from ..text.nn_model import TorchUniversalModel @@ -79,32 +89,56 @@ "snn": SNN, "node": NODE, "autoint": AutoInt, - "autoint_emb_v2": AutoInt, } -cat_embedder_by_name = { - "denselight": CatEmbedder, - "dense": CatEmbedder, - "resnet": CatEmbedder, - "mlp": CatEmbedder, - "linear_layer": CatEmbedder, - "_linear_layer": CatEmbedder, - "snn": CatEmbedder, - "node": CatEmbedder, - "autoint": BasicEmbedding, - "autoint_emb_v2": DefaultEmbedding, +input_type_by_name = { + "denselight": "flat", + "dense": "flat", + "resnet": "flat", + "mlp": "flat", + "linear_layer": "flat", + "_linear_layer": "flat", + "snn": "flat", + "node": "flat", + "autoint": "seq", } -cont_embedder_params_by_name = { - "denselight": ContEmbedder, - "dense": ContEmbedder, - "resnet": ContEmbedder, - "mlp": ContEmbedder, - "linear_layer": ContEmbedder, - "_linear_layer": ContEmbedder, - "snn": ContEmbedder, - "node": ContEmbedder, - "autoint": LinearEmbedding, - "autoint_emb_v2": DenseEmbedding, +cat_embedder_by_name_flat = { + "cat": CatEmbedder, + "cat_no_dropout": BasicCatEmbeddingFlat, + "weighted": WeightedCatEmbeddingFlat, } +cat_embedder_by_name = {"cat_no_dropout": BasicCatEmbedding, "weighted": WeightedCatEmbedding} +cont_embedder_by_name_flat = {"cont": ContEmbedder, "linear": LinearEmbeddingFlat, "dense": DenseEmbeddingFlat} +cont_embedder_by_name = {"linear": LinearEmbedding, "dense": DenseEmbedding} + + +def 
_get_embedder_cat(params): + if input_type_by_name[params["model"]] == "seq": + try: + out = cat_embedder_by_name[params["cat_embedder"]] + except KeyError: + out = BasicCatEmbedding + return out + else: + try: + out = cat_embedder_by_name_flat[params["cat_embedder"]] + except KeyError: + out = CatEmbedder + return out + + +def _get_embedder_cont(params): + if input_type_by_name[params["model"]] == "seq": + try: + out = cont_embedder_by_name[params["cont_embedder"]] + except KeyError: + out = LinearEmbedding + return out + else: + try: + out = cont_embedder_by_name_flat[params["cont_embedder"]] + except KeyError: + out = ContEmbedder + return out class TorchModel(TabularMLAlgo): @@ -278,7 +312,7 @@ def _infer_params(self): net=TorchUniversalModel if not params["model_with_emb"] else params["model"], net_params={ "task": self.task, - "cont_embedder": cont_embedder_params_by_name[params["model"]] if is_cont else None, + "cont_embedder_": _get_embedder_cont(params) if is_cont else None, "cont_params": { "num_dims": params["num_dims"], "input_bn": params["input_bn"], @@ -287,7 +321,7 @@ def _infer_params(self): } if is_cont else None, - "cat_embedder": cat_embedder_by_name[params["model"]] if is_cat else None, + "cat_embedder_": _get_embedder_cat(params) if is_cat else None, "cat_params": { "cat_vc": params["cat_vc"], "cat_dims": params["cat_dims"], diff --git a/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py b/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py index c14944f5..ba047d52 100644 --- a/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py +++ b/lightautoml/ml_algo/torch_based/autoint/autoint_utils.py @@ -12,53 +12,6 @@ EmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "output_size"]) UniformEmbeddingInfo = namedtuple("EmbeddingInfo", ["num_fields", "embedding_size", "output_size"]) -MODULE_INIT_DOC = """ -Parameters ----------- -output_size : int - number of final output values; i.e., number of targets for - regression or number 
of classes for classification -embedding_num : EmbeddingBase or None - initialized and fit embedding for numeric fields -embedding_cat : EmbeddingBase or None - initialized and fit embedding for categorical fields -embedding_l1_reg : float, optional - value for l1 regularization of embedding vectors; default is 0.0 -embedding_l2_reg : float, optional - value for l2 regularization of embedding vectors; default is 0.0 -{} -mlp_hidden_sizes : int or iterable of int, optional - sizes for the linear transformations between the MLP input and - the output size needed based on the target; default is (512, 256, 128, 64) -mlp_activation : subclass of torch.nn.Module (uninitialized), optional - default is nn.LeakyReLU -mlp_use_bn : boolean, optional - whether to use batch normalization between MLP linear layers; - default is True -mlp_bn_momentum : float, optional - only used if `mlp_use_bn` is True; default is 0.01 -mlp_ghost_batch : int or None, optional - only used if `mlp_use_bn` is True; size of batch in "ghost batch norm"; - if None, normal batch norm is used; defualt is None -mlp_dropout : float, optional - whether and how much dropout to use between MLP linear layers; - `0.0 <= mlp_dropout < 1.0`; default is 0.0 -mlp_use_skip : boolean, optional - use a side path in the MLP containing just the optional leaky gate - plus single linear layer; default is True -mlp_l1_reg : float, optional - value for l1 regularization of MLP weights; default is 0.0 -mlp_l2_reg : float, optional - value for l2 regularization of MLP weights; default is 0.0 -use_leaky_gate : boolean, optional - whether to include "leaky gate" layers; default is True -loss_fn : "auto" or PyTorch loss function, optional - default is "auto" -device : string or torch.device, optional - default is "cpu" - -""" - class LeakyGate(nn.Module): """LeakyGate from https://github.com/jrfiedler/xynn. 
diff --git a/lightautoml/text/embed.py b/lightautoml/text/embed.py index eaa15558..650a86e6 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -8,7 +8,6 @@ from typing import Sequence from typing import Union from functools import reduce -import numpy as np import torch import torch.nn as nn from torch import Tensor @@ -167,9 +166,11 @@ def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: return output -class BasicEmbedding(nn.Module): +class BasicCatEmbedding(nn.Module): """A basic embedding that creates an embedded vector for each field value from https://github.com/jrfiedler/xynn. + The same as CatEmbedder, but without dropout, and it can be presented as a sequance. + Args: embedding_size : int, optional size of each value's embedding vector; default is 10 @@ -180,7 +181,7 @@ class BasicEmbedding(nn.Module): def __init__( self, - cat_vc: Sequence[Dict], + cat_dims: Sequence[int], embedding_size: int = 10, device: Union[str, torch.device] = "cuda:0", flatten_output: bool = False, @@ -189,37 +190,22 @@ def __init__( super().__init__() self.flatten_output = flatten_output self._device = device - self._isfit = False self.num_fields = 0 self.output_size = 0 - self.lookup: Dict[Tuple[int, Any], int] = {} - self.lookup_nan: Dict[int, int] = {} - self.num_values = 0 self.embedding: Optional[nn.Embedding] = None self.embedding_size = embedding_size - self._from_summary(cat_vc) - self.cat_len = len(cat_vc) + self._from_summary(cat_dims) + self.cat_len = len(cat_dims) - def _from_summary(self, uniques: List[Union[List, Tensor, np.ndarray]]): - lookup = {} - lookup_nan = {} + def _from_summary(self, cat_dims: Sequence[int]): num_values = 0 - for fieldnum, field in enumerate(uniques): - for value in field: - if (fieldnum, value) in lookup: - # extra defense against repeated values - continue - lookup[(fieldnum, value)] = num_values - num_values += 1 - self.num_fields = len(uniques) + self.emb_layers = nn.ModuleList([nn.Embedding(int(x), 
self.embedding_size) for x in cat_dims]) + self.num_fields = len(cat_dims) self.output_size = self.num_fields * self.embedding_size - self.lookup = lookup - self.lookup_nan = lookup_nan self.num_values = num_values - self.embedding = nn.Embedding(num_values, self.embedding_size) - nn.init.xavier_uniform_(self.embedding.weight) - self._isfit = True + for emb in self.emb_layers: + nn.init.xavier_uniform_(emb.weight) def get_out_shape(self) -> int: """Output shape. @@ -243,23 +229,17 @@ def forward(self, X: Dict) -> Tensor: torch.Tensor """ - if not self._isfit: - raise RuntimeError("need to call `fit` or `from_summary` first") X = X["cat"] - idxs: List[List[int]] = [] - for row in X: - idxs.append([]) - for col, val in enumerate(row): - val = val.item() - idx = self.lookup[(col, val)] - idxs[-1].append(idx) - x = self.embedding(torch.tensor(idxs, dtype=torch.int64, device=self._device)) + x = torch.stack( + [emb_layer(X[:, i]) for i, emb_layer in enumerate(self.emb_layers)], + dim=1, + ) if self.flatten_output: return x.view(x.shape[0], -1) return x -class DefaultEmbedding(nn.Module): +class WeightedCatEmbedding(nn.Module): """DefaultEmbedding from https://github.com/jrfiedler/xynn. An embedding with a default value for each field. The default is returned for @@ -295,7 +275,6 @@ def __init__( ): super().__init__() self.flatten_output = flatten_output - self._isfit = False self._device = device self.num_fields = 0 self.output_size = 0 @@ -327,8 +306,6 @@ def _from_summary(self, unique_counts: List[Dict[Any, int]]): self.embedding = nn.Embedding(num_values, self.embedding_size) nn.init.xavier_uniform_(self.embedding.weight) - self._isfit = True - def get_out_shape(self) -> int: """Output shape. 
@@ -350,8 +327,6 @@ def forward(self, X: Dict) -> Tensor: Returns: torch.Tensor """ - if not self._isfit: - raise RuntimeError("need to call `fit` or `from_summary` first") X = X["cat"] list_weights: List[List[List[float]]] = [] idxs_primary: List[List[int]] = [] @@ -393,7 +368,6 @@ class LinearEmbedding(nn.Module): def __init__(self, num_dims: int, embedding_size: int = 10, flatten_output: bool = False, **kwargs): super().__init__() self.flatten_output = flatten_output - self._isfit = False self.num_fields = num_dims self.output_size = 0 self.embedding: Optional[nn.Embedding] = None @@ -405,7 +379,6 @@ def _from_summary(self, num_fields: int): self.output_size = num_fields * self.embedding_size self.embedding = nn.Embedding(num_fields, self.embedding_size) nn.init.xavier_uniform_(self.embedding.weight) - self._isfit = True def get_out_shape(self) -> int: """Output shape. @@ -430,8 +403,6 @@ def forward(self, X: Dict) -> Tensor: """ X = X["cont"] - if not self._isfit: - raise RuntimeError("need to call `fit` or `from_summary` first") x = self.embedding.weight * X.unsqueeze(dim=-1) if self.flatten_output: return x.view(x.shape[0], -1) @@ -468,7 +439,6 @@ def __init__( embedding_size = (1, embedding_size) elif len(embedding_size) == 1: embedding_size = (1, embedding_size[0]) - self._isfit = False self.num_fields = num_dims self.output_size = 0 self.embedding_w = None @@ -483,7 +453,6 @@ def _from_summary(self, num_fields: int): self.embedding_w = nn.Parameter(torch.zeros((num_fields, *self.dense_out_size))) self.embedding_b = nn.Parameter(torch.zeros(self.dense_out_size)) nn.init.xavier_uniform_(self.embedding_w) - self._isfit = True def get_out_shape(self) -> int: """Output shape. 
@@ -508,9 +477,7 @@ def forward(self, X: Dict) -> Tensor: """ X = X["cont"] - if not self._isfit: - raise RuntimeError("need to call `fit` or `from_summary` first") - embedded = self.embedding_w.T.matmul(X.T.to(dtype=torch.float)).T + self.embedding_b + embedded = self.embedding_w.T.matmul(X.T.float()).T + self.embedding_b embedded = self.activation(embedded.reshape((X.shape[0], -1))) x = embedded.reshape((X.shape[0], *self.dense_out_size)) if self.flatten_output: @@ -532,15 +499,15 @@ def __init__(self, *args, **kwargs): super(LinearEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) -class DefaultEmbeddingFlat(DefaultEmbedding): - """Flatten version of DefaultEmbedding.""" +class WeightedCatEmbeddingFlat(WeightedCatEmbedding): + """Flatten version of WeightedCatEmbedding.""" def __init__(self, *args, **kwargs): - super(DefaultEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + super(WeightedCatEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) -class BasicEmbeddingFlat(BasicEmbedding): - """Flatten version of BasicEmbedding.""" +class BasicCatEmbeddingFlat(BasicCatEmbedding): + """Flatten version of BasicCatEmbedding.""" def __init__(self, *args, **kwargs): - super(BasicEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + super(BasicCatEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) diff --git a/lightautoml/text/nn_model.py b/lightautoml/text/nn_model.py index 916cfec6..dc4db2ae 100644 --- a/lightautoml/text/nn_model.py +++ b/lightautoml/text/nn_model.py @@ -114,9 +114,9 @@ def __init__( task: Task, torch_model: nn.Module, n_out: int = 1, - cont_embedder: Optional[Any] = None, + cont_embedder_: Optional[Any] = None, cont_params: Optional[Dict] = None, - cat_embedder: Optional[Any] = None, + cat_embedder_: Optional[Any] = None, cat_params: Optional[Dict] = None, text_embedder: Optional[Any] = None, text_params: 
Optional[Dict] = None, @@ -135,11 +135,11 @@ def __init__( self.text_embedder = None n_in = 0 - if cont_embedder is not None: - self.cont_embedder = cont_embedder(**cont_params) + if cont_embedder_ is not None: + self.cont_embedder = cont_embedder_(**cont_params) n_in += self.cont_embedder.get_out_shape() - if cat_embedder is not None: - self.cat_embedder = cat_embedder(**cat_params) + if cat_embedder_ is not None: + self.cat_embedder = cat_embedder_(**cat_params) n_in += self.cat_embedder.get_out_shape() if text_embedder is not None: self.text_embedder = text_embedder(**text_params) From 788d381e616f220c1ac7b44611b8a867ecf9b7b2 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 12:24:38 +0000 Subject: [PATCH 18/49] resolve merge conflicts --- lightautoml/ml_algo/torch_based/nn_models.py | 2 +- lightautoml/ml_algo/torch_based/node_nn_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 92ed10ee..0988e0b1 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -1091,4 +1091,4 @@ def forward(self, x): return out def forward_masks(self, x): - return self.encoder.forward_masks(x) \ No newline at end of file + return self.encoder.forward_masks(x) diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py index f0f195af..5259f24f 100644 --- a/lightautoml/ml_algo/torch_based/node_nn_model.py +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -357,7 +357,7 @@ class MeanPooling(nn.Module): n_out: int, output dim. dim: int: the dimension to be averaged. 
<<<<<<< HEAD - + ======= >>>>>>> autoint++ From f6fdb58b49510c01fb0ac8e2308c56529d74ad13 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 15 Aug 2023 12:27:16 +0000 Subject: [PATCH 19/49] resolve merge conflicts --- lightautoml/ml_algo/torch_based/nn_models.py | 4 --- .../ml_algo/torch_based/node_nn_model.py | 29 ------------------- 2 files changed, 33 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 0988e0b1..f8bfbea1 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -840,11 +840,7 @@ def __init__( self.features1.add_module("ODSTForestblock%d", block) self.features2 = nn.Sequential(OrderedDict([])) if use_original_head: -<<<<<<< HEAD - last_layer = MeanPooling(n_out,dim=-2) -======= last_layer = MeanPooling(n_out, dim=-2) ->>>>>>> autoint++ self.features2.add_module("head", last_layer) else: if use_bn: diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py index 5259f24f..9bf02d44 100644 --- a/lightautoml/ml_algo/torch_based/node_nn_model.py +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -122,11 +122,6 @@ def _threshold_and_support(input, dim=-1): return tau, support_size -<<<<<<< HEAD -sparsemax = lambda input, dim=-1: SparsemaxFunction.apply(input, dim) # noqa: E731 -sparsemoid = lambda input: (0.5 * input + 0.5).clamp_(0, 1) # noqa: E731 -class Sparsemax(nn.Module): -======= class Sparsemax(nn.Module): """Py-Torch class for Sparsemax.""" @@ -162,7 +157,6 @@ def forward(self, input): Sparsemoid(input) """ return (0.5 * input + 0.5).clamp_(0, 1) ->>>>>>> autoint++ def __init__(self, dim=-1): self.dim = dim @@ -299,19 +293,6 @@ def _backward(output, grad_output): return grad_input -<<<<<<< HEAD -entmax15 = lambda input, dim=-1: Entmax15Function.apply(input, dim) # noqa: E731 -entmoid15 = Entmoid15.apply # noqa: E731 -class Entmax15(nn.Module): - - def 
__init__(self, dim=-1): - self.dim = dim - super(Entmax15, self).__init__() - - def forward(self, input): - return Entmax15Function.apply(input, self.dim) - -======= class Entmax15(nn.Module): """Py-Torch class for Entmax15.""" @@ -349,18 +330,12 @@ def forward(self, input): return Entmoid15Optimied.apply(input) ->>>>>>> autoint++ class MeanPooling(nn.Module): """Pytorch implementation of MeanPooling head. Args: n_out: int, output dim. dim: int: the dimension to be averaged. -<<<<<<< HEAD - -======= - ->>>>>>> autoint++ """ def __init__(self, n_out, dim=-1): @@ -376,11 +351,7 @@ def forward(self, x: torch.Tensor): Returns: x[..., :self.n_out].mean(dim=self.dim) """ -<<<<<<< HEAD - return x[..., :self.n_out].mean(dim=self.dim) -======= return x[..., : self.n_out].mean(dim=self.dim) ->>>>>>> autoint++ class ModuleWithInit(nn.Module): From a7fe9be2694092dc66547fa7feacbef2d530d97e Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 16 Aug 2023 11:57:35 +0000 Subject: [PATCH 20/49] PLR + SOFTEmb --- lightautoml/automl/presets/tabular_presets.py | 2 +- lightautoml/ml_algo/dl_model.py | 17 +- lightautoml/ml_algo/tabnet/utils.py | 2 +- lightautoml/ml_algo/torch_based/nn_models.py | 4 +- .../ml_algo/torch_based/node_nn_model.py | 18 +- .../pytorch_tabnet/abstract_model.py | 57 ++---- .../torch_based/pytorch_tabnet/tab_network.py | 72 +++---- lightautoml/text/embed.py | 191 ++++++++++++++++++ 8 files changed, 254 insertions(+), 109 deletions(-) diff --git a/lightautoml/automl/presets/tabular_presets.py b/lightautoml/automl/presets/tabular_presets.py index 539b2df4..cac85a0e 100755 --- a/lightautoml/automl/presets/tabular_presets.py +++ b/lightautoml/automl/presets/tabular_presets.py @@ -609,7 +609,7 @@ def create_automl(self, **fit_args): "node", "autoint", "autoint_emb_v2", - "tabnet" + "tabnet", ] available_nn_models = available_nn_models + [x + "_tuned" for x in available_nn_models] nn_models = [ diff --git a/lightautoml/ml_algo/dl_model.py 
b/lightautoml/ml_algo/dl_model.py index b3124c0e..6f49e633 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -51,6 +51,10 @@ DenseEmbeddingFlat, LinearEmbedding, LinearEmbeddingFlat, + PLREmbedding, + PLREmbeddingFlat, + SoftEmbedding, + SoftEmbeddingFlat, WeightedCatEmbedding, BasicCatEmbedding, WeightedCatEmbeddingFlat, @@ -89,6 +93,7 @@ "snn": SNN, "node": NODE, "autoint": AutoInt, + "tabnet": TabNet, } input_type_by_name = { "denselight": "flat", @@ -100,6 +105,7 @@ "snn": "flat", "node": "flat", "autoint": "seq", + "tabnet": "flat", } cat_embedder_by_name_flat = { "cat": CatEmbedder, @@ -107,8 +113,15 @@ "weighted": WeightedCatEmbeddingFlat, } cat_embedder_by_name = {"cat_no_dropout": BasicCatEmbedding, "weighted": WeightedCatEmbedding} -cont_embedder_by_name_flat = {"cont": ContEmbedder, "linear": LinearEmbeddingFlat, "dense": DenseEmbeddingFlat} -cont_embedder_by_name = {"linear": LinearEmbedding, "dense": DenseEmbedding} + +cont_embedder_by_name_flat = { + "cont": ContEmbedder, + "linear": LinearEmbeddingFlat, + "dense": DenseEmbeddingFlat, + "plr": PLREmbeddingFlat, + "soft": SoftEmbeddingFlat, +} +cont_embedder_by_name = {"linear": LinearEmbedding, "dense": DenseEmbedding, "plr": PLREmbedding, "soft": SoftEmbedding} def _get_embedder_cat(params): diff --git a/lightautoml/ml_algo/tabnet/utils.py b/lightautoml/ml_algo/tabnet/utils.py index 332f0d89..40845a8b 100644 --- a/lightautoml/ml_algo/tabnet/utils.py +++ b/lightautoml/ml_algo/tabnet/utils.py @@ -2,7 +2,7 @@ import torch import numpy as np import torch.nn as nn -from lightautoml.ml_algo.torch_based.node_nn_model import Entmax15, Sparsemax, sparsemax, entmax15 +from lightautoml.ml_algo.torch_based.node_nn_model import Entmax15, Sparsemax from lightautoml.ml_algo.torch_based.autoint.ghost_norm import GhostBatchNorm diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index f8bfbea1..b00fe220 100644 --- 
a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -978,8 +978,6 @@ def forward(self, embedded: torch.Tensor) -> torch.Tensor: return out - - class TabNet(torch.nn.Module): def __init__( self, @@ -1059,7 +1057,7 @@ def __init__( virtual_batch_size=virtual_batch_size, momentum=momentum, mask_type=mask_type, - group_attention_matrix=group_attention_matrix + group_attention_matrix=group_attention_matrix, ) if self.is_multi_task: diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py index 9bf02d44..e57f5125 100644 --- a/lightautoml/ml_algo/torch_based/node_nn_model.py +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -128,7 +128,7 @@ class Sparsemax(nn.Module): def __init__(self): super(Sparsemax, self).__init__() - def forward(self, input, dim): + def forward(self, input, dim=-1): """Forward-pass. Args: @@ -158,12 +158,6 @@ def forward(self, input): """ return (0.5 * input + 0.5).clamp_(0, 1) - def __init__(self, dim=-1): - self.dim = dim - super(Sparsemax, self).__init__() - - def forward(self, input): - return SparsemaxFunction.apply(input, self.dim) class Entmax15Function(Function): """An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). 
@@ -243,7 +237,7 @@ def _threshold_and_support(input, dim=-1): return tau_star, support_size -class Entmoid15Optimied(Function): +class Entmoid15Optimized(Function): """A highly optimized equivalent of lambda x: Entmax15([x, 0]).""" @staticmethod @@ -257,7 +251,7 @@ def forward(ctx, input): Returns: output (Tensor): same shape as input """ - output = Entmoid15Optimied._forward(input) + output = Entmoid15Optimized._forward(input) ctx.save_for_backward(output) return output @@ -281,7 +275,7 @@ def backward(ctx, grad_output): Returns: grad output """ - return Entmoid15Optimied._backward(ctx.saved_tensors[0], grad_output) + return Entmoid15Optimized._backward(ctx.saved_tensors[0], grad_output) @staticmethod @script @@ -299,7 +293,7 @@ class Entmax15(nn.Module): def __init__(self): super(Entmax15, self).__init__() - def forward(self, input, dim): + def forward(self, input, dim=-1): """Forward-pass. Args: @@ -327,7 +321,7 @@ def forward(self, input): Returns: Entmoid15(input) """ - return Entmoid15Optimied.apply(input) + return Entmoid15Optimized.apply(input) class MeanPooling(nn.Module): diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py index a1734439..76c4de53 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py @@ -17,7 +17,7 @@ check_input, check_warm_start, create_group_matrix, - check_embedding_parameters + check_embedding_parameters, ) from pytorch_tabnet.callbacks import ( CallbackContainer, @@ -85,9 +85,7 @@ def __post_init__(self): self.optimizer_fn = copy.deepcopy(self.optimizer_fn) self.scheduler_fn = copy.deepcopy(self.scheduler_fn) - updated_params = check_embedding_parameters(self.cat_dims, - self.cat_idxs, - self.cat_emb_dim) + updated_params = check_embedding_parameters(self.cat_dims, self.cat_idxs, self.cat_emb_dim) self.cat_dims, self.cat_idxs, self.cat_emb_dim = 
updated_params def __update__(self, **kwargs): @@ -140,7 +138,7 @@ def fit( from_unsupervised=None, warm_start=False, augmentations=None, - compute_importance=True + compute_importance=True, ): """Train a neural network stored in self.network Using train_dataloader for training data and @@ -227,9 +225,7 @@ def fit( # Validate and reformat eval set depending on training data eval_names, eval_set = validate_eval_set(eval_set, eval_name, X_train, y_train) - train_dataloader, valid_dataloaders = self._construct_loaders( - X_train, y_train, eval_set - ) + train_dataloader, valid_dataloaders = self._construct_loaders(X_train, y_train, eval_set) if from_unsupervised is not None: # Update parameters to match self pretraining @@ -262,9 +258,7 @@ def fit( self._predict_epoch(eval_name, valid_dataloader) # Call method on_epoch_end for all callbacks - self._callback_container.on_epoch_end( - epoch_idx, logs=self.history.epoch_metrics - ) + self._callback_container.on_epoch_end(epoch_idx, logs=self.history.epoch_metrics) if self._stop_training: break @@ -355,11 +349,8 @@ def explain(self, X, normalize=False): M_explain, masks = self.network.forward_masks(data) for key, value in masks.items(): - masks[key] = csc_matrix.dot( - value.cpu().detach().numpy(), self.reducing_matrix - ) - original_feat_explain = csc_matrix.dot(M_explain.cpu().detach().numpy(), - self.reducing_matrix) + masks[key] = csc_matrix.dot(value.cpu().detach().numpy(), self.reducing_matrix) + original_feat_explain = csc_matrix.dot(M_explain.cpu().detach().numpy(), self.reducing_matrix) res_explain.append(original_feat_explain) if batch_nb == 0: @@ -417,9 +408,7 @@ def save_model(self, path): init_params[key] = val saved_params["init_params"] = init_params - class_attrs = { - "preds_mapper": self.preds_mapper - } + class_attrs = {"preds_mapper": self.preds_mapper} saved_params["class_attrs"] = class_attrs # Create folder @@ -645,9 +634,7 @@ def _set_metrics(self, metrics, eval_names): # Set metric container for 
each sets self._metric_container_dict = {} for name in eval_names: - self._metric_container_dict.update( - {name: MetricContainer(metrics, prefix=f"{name}_")} - ) + self._metric_container_dict.update({name: MetricContainer(metrics, prefix=f"{name}_")}) self._metrics = [] self._metrics_names = [] @@ -656,9 +643,7 @@ def _set_metrics(self, metrics, eval_names): self._metrics_names.extend(metric_container.names) # Early stopping metric is the last eval metric - self.early_stopping_metric = ( - self._metrics_names[-1] if len(self._metrics_names) > 0 else None - ) + self.early_stopping_metric = self._metrics_names[-1] if len(self._metrics_names) > 0 else None def _set_callbacks(self, custom_callbacks): """Setup the callbacks functions. @@ -668,7 +653,7 @@ def _set_callbacks(self, custom_callbacks): custom_callbacks : list of func List of callback functions. - """ + """ # Setup default callbacks history, early stopping and scheduler callbacks = [] self.history = History(self, verbose=self.verbose) @@ -676,9 +661,7 @@ def _set_callbacks(self, custom_callbacks): if (self.early_stopping_metric is not None) and (self.patience > 0): early_stopping = EarlyStopping( early_stopping_metric=self.early_stopping_metric, - is_maximize=( - self._metrics[-1]._maximize if len(self._metrics) > 0 else None - ), + is_maximize=(self._metrics[-1]._maximize if len(self._metrics) > 0 else None), patience=self.patience, ) callbacks.append(early_stopping) @@ -705,9 +688,7 @@ def _set_callbacks(self, custom_callbacks): def _set_optimizer(self): """Setup optimizer.""" - self._optimizer = self.optimizer_fn( - self.network.parameters(), **self.optimizer_params - ) + self._optimizer = self.optimizer_fn(self.network.parameters(), **self.optimizer_params) def _construct_loaders(self, X_train, y_train, eval_set): """Generate dataloaders for train and eval set. 
@@ -781,9 +762,7 @@ def update_fit_params(self, X_train, y_train, eval_set, weights): 0 for no balancing 1 for automated balancing """ - raise NotImplementedError( - "users must define update_fit_params to use this base class" - ) + raise NotImplementedError("users must define update_fit_params to use this base class") @abstractmethod def compute_loss(self, y_score, y_true): @@ -802,9 +781,7 @@ def compute_loss(self, y_score, y_true): float Loss value """ - raise NotImplementedError( - "users must define compute_loss to use this base class" - ) + raise NotImplementedError("users must define compute_loss to use this base class") @abstractmethod def prepare_target(self, y): @@ -821,6 +798,4 @@ def prepare_target(self, y): `torch.Tensor` Converted target matrix. """ - raise NotImplementedError( - "users must define prepare_target to use this base class" - ) + raise NotImplementedError("users must define prepare_target to use this base class") diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py index 95c2bae2..4cc67f55 100644 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py +++ b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py @@ -115,13 +115,9 @@ def __init__( shared_feat_transform = torch.nn.ModuleList() for i in range(self.n_shared): if i == 0: - shared_feat_transform.append( - Linear(self.input_dim, 2 * (n_d + n_a), bias=False) - ) + shared_feat_transform.append(Linear(self.input_dim, 2 * (n_d + n_a), bias=False)) else: - shared_feat_transform.append( - Linear(n_d + n_a, 2 * (n_d + n_a), bias=False) - ) + shared_feat_transform.append(Linear(n_d + n_a, 2 * (n_d + n_a), bias=False)) else: shared_feat_transform = None @@ -170,9 +166,7 @@ def forward(self, x, prior=None): steps_output = [] for step in range(self.n_steps): M = self.att_transformers[step](prior, att) - M_loss += torch.mean( - torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1) - 
) + M_loss += torch.mean(torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1)) # update prior prior = torch.mul(self.gamma - M, prior) # output @@ -337,15 +331,10 @@ def __init__( raise ValueError("n_shared and n_independent can't be both zero.") self.virtual_batch_size = virtual_batch_size - self.embedder = EmbeddingGenerator(input_dim, - cat_dims, - cat_idxs, - cat_emb_dim, - group_attention_matrix) + self.embedder = EmbeddingGenerator(input_dim, cat_dims, cat_idxs, cat_emb_dim, group_attention_matrix) self.post_embed_dim = self.embedder.post_embed_dim - self.masker = RandomObfuscator(self.pretraining_ratio, - group_matrix=self.embedder.embedding_group_matrix) + self.masker = RandomObfuscator(self.pretraining_ratio, group_matrix=self.embedder.embedding_group_matrix) self.encoder = TabNetEncoder( input_dim=self.post_embed_dim, output_dim=self.post_embed_dim, @@ -474,7 +463,7 @@ def __init__( virtual_batch_size=virtual_batch_size, momentum=momentum, mask_type=mask_type, - group_attention_matrix=group_attention_matrix + group_attention_matrix=group_attention_matrix, ) if self.is_multi_task: @@ -588,11 +577,7 @@ def __init__( raise ValueError("n_shared and n_independent can't be both zero.") self.virtual_batch_size = virtual_batch_size - self.embedder = EmbeddingGenerator(input_dim, - cat_dims, - cat_idxs, - cat_emb_dim, - group_attention_matrix) + self.embedder = EmbeddingGenerator(input_dim, cat_dims, cat_idxs, cat_emb_dim, group_attention_matrix) self.post_embed_dim = self.embedder.post_embed_dim self.tabnet = TabNetNoEmbeddings( @@ -608,7 +593,7 @@ def __init__( virtual_batch_size, momentum, mask_type, - self.embedder.embedding_group_matrix + self.embedder.embedding_group_matrix, ) def forward(self, x): @@ -649,9 +634,7 @@ def __init__( super(AttentiveTransformer, self).__init__() self.fc = Linear(input_dim, group_dim, bias=False) initialize_non_glu(self.fc, input_dim, group_dim) - self.bn = GBN( - group_dim, virtual_batch_size=virtual_batch_size, 
momentum=momentum - ) + self.bn = GBN(group_dim, virtual_batch_size=virtual_batch_size, momentum=momentum) if mask_type == "sparsemax": # Sparsemax @@ -660,9 +643,7 @@ def __init__( # Entmax self.selector = sparsemax.Entmax15(dim=-1) else: - raise NotImplementedError( - "Please choose either sparsemax" + "or entmax as masktype" - ) + raise NotImplementedError("Please choose either sparsemax" + "or entmax as masktype") def forward(self, priors, processed_feat): x = self.fc(processed_feat) @@ -729,9 +710,7 @@ def __init__( self.specifics = torch.nn.Identity() else: spec_input_dim = input_dim if is_first else output_dim - self.specifics = GLU_Block( - spec_input_dim, output_dim, first=is_first, **params - ) + self.specifics = GLU_Block(spec_input_dim, output_dim, first=is_first, **params) def forward(self, x): x = self.shared(x) @@ -783,9 +762,7 @@ def forward(self, x): class GLU_Layer(torch.nn.Module): - def __init__( - self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02 - ): + def __init__(self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02): super(GLU_Layer, self).__init__() self.output_dim = output_dim @@ -795,9 +772,7 @@ def __init__( self.fc = Linear(input_dim, 2 * output_dim, bias=False) initialize_glu(self.fc, input_dim, 2 * output_dim) - self.bn = GBN( - 2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum - ) + self.bn = GBN(2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum) def forward(self, x): x = self.fc(x) @@ -852,20 +827,23 @@ def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dims, group_matrix): # update group matrix n_groups = group_matrix.shape[0] - self.embedding_group_matrix = torch.empty((n_groups, self.post_embed_dim), - device=group_matrix.device) + self.embedding_group_matrix = torch.empty((n_groups, self.post_embed_dim), device=group_matrix.device) for group_idx in range(n_groups): post_emb_idx = 0 cat_feat_counter = 0 for init_feat_idx in 
range(input_dim): if self.continuous_idx[init_feat_idx] == 1: # this means that no embedding is applied to this column - self.embedding_group_matrix[group_idx, post_emb_idx] = group_matrix[group_idx, init_feat_idx] # noqa + self.embedding_group_matrix[group_idx, post_emb_idx] = group_matrix[ + group_idx, init_feat_idx + ] # noqa post_emb_idx += 1 else: # this is a categorical feature which creates multiple embeddings n_embeddings = cat_emb_dims[cat_feat_counter] - self.embedding_group_matrix[group_idx, post_emb_idx:post_emb_idx+n_embeddings] = group_matrix[group_idx, init_feat_idx] / n_embeddings # noqa + self.embedding_group_matrix[group_idx, post_emb_idx : post_emb_idx + n_embeddings] = ( + group_matrix[group_idx, init_feat_idx] / n_embeddings + ) # noqa post_emb_idx += n_embeddings cat_feat_counter += 1 @@ -886,9 +864,7 @@ def forward(self, x): if is_continuous: cols.append(x[:, feat_init_idx].float().view(-1, 1)) else: - cols.append( - self.embeddings[cat_feat_counter](x[:, feat_init_idx].long()) - ) + cols.append(self.embeddings[cat_feat_counter](x[:, feat_init_idx].long())) cat_feat_counter += 1 # concat post_embeddings = torch.cat(cols, dim=1) @@ -913,7 +889,7 @@ def __init__(self, pretraining_ratio, group_matrix): super(RandomObfuscator, self).__init__() self.pretraining_ratio = pretraining_ratio # group matrix is set to boolean here to pass all posssible information - self.group_matrix = (group_matrix > 0) + 0. 
+ self.group_matrix = (group_matrix > 0) + 0.0 self.num_groups = group_matrix.shape[0] def forward(self, x): @@ -926,9 +902,7 @@ def forward(self, x): """ bs = x.shape[0] - obfuscated_groups = torch.bernoulli( - self.pretraining_ratio * torch.ones((bs, self.num_groups), device=x.device) - ) + obfuscated_groups = torch.bernoulli(self.pretraining_ratio * torch.ones((bs, self.num_groups), device=x.device)) obfuscated_vars = torch.matmul(obfuscated_groups, self.group_matrix) masked_input = torch.mul(1 - obfuscated_vars, x) return masked_input, obfuscated_groups, obfuscated_vars diff --git a/lightautoml/text/embed.py b/lightautoml/text/embed.py index 650a86e6..970e8b3e 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -511,3 +511,194 @@ class BasicCatEmbeddingFlat(BasicCatEmbedding): def __init__(self, *args, **kwargs): super(BasicCatEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + + +class NLinearMemoryEfficient(nn.Module): + """Linear multi-dim embedding from https://github.com/yandex-research/tabular-dl-num-embeddings/tree/c1d9eb63c0685b51d7e1bc081cdce6ffdb8886a8. + + Args: + n : num of features. + d_in: input size. + d_out: output size. + """ + + def __init__(self, n: int, d_in: int, d_out: int) -> None: + super().__init__() + self.layers = nn.ModuleList([nn.Linear(d_in, d_out) for _ in range(n)]) + + def forward(self, x): + return torch.stack([l(x[:, i]) for i, l in enumerate(self.layers)], 1) + + +class Periodic(nn.Module): + """Periodic positional embedding for numeric features from https://github.com/yandex-research/tabular-dl-num-embeddings/tree/c1d9eb63c0685b51d7e1bc081cdce6ffdb8886a8. + + Args: + n_features: num of numeric features + emb_size: output size will be 2*emb_size + sigma: weights will be initialized with N(0,sigma) + flatten_output: if flatten output or not. 
+ """ + + def __init__( + self, n_features: int, emb_size: int = 64, sigma: float = 0.05, flatten_output: bool = False, **kwargs + ) -> None: + super().__init__() + self.n_features = n_features + self.emb_size = emb_size + coefficients = torch.normal(0.0, sigma, (n_features, emb_size)) + self.coefficients = nn.Parameter(coefficients) + self.flatten_output = flatten_output + + @staticmethod + def _cos_sin(x: Tensor) -> Tensor: + return torch.cat([torch.cos(x), torch.sin(x)], -1) + + def get_out_shape(self) -> int: + """Output shape. + + Returns: + int with module output shape. + + """ + if self.flatten_output: + return self.emb_size * 2 * self.n_features + else: + return self.n_features + + def forward(self, x: Tensor) -> Tensor: + x = self._cos_sin(2 * torch.pi * self.coefficients[None] * x[..., None]) + if self.flatten_output: + return x.view(x.shape[0], -1) + return x + + +class PLREmbedding(nn.Module): + """ReLU ◦ Linear ◦ Periodic embedding for numeric features from https://arxiv.org/pdf/2203.05556.pdf. + + Args: + num_dims: int + emb_size: int + sigma: float + flatten_output : bool + """ + + def __init__( + self, + num_dims: int, + embedding_size: Union[int, Tuple[int, ...], List[int]] = 64, + emb_size_periodic: int = 64, + sigma_periodic: float = 0.05, + flatten_output: bool = False, + **kwargs, + ) -> None: + super().__init__() + self.num_dims = num_dims + self.embedding_size = embedding_size + self.layers: list[nn.Module] = [] + self.layers.append(Periodic(num_dims, emb_size_periodic, sigma_periodic)) + self.layers.append(NLinearMemoryEfficient(num_dims, 2 * emb_size_periodic, embedding_size)) + self.layers.append(nn.ReLU()) + self.layers = nn.Sequential(*self.layers) + self.flatten_output = flatten_output + + def get_out_shape(self) -> int: + """Output shape. + + Returns: + int with module output shape. 
+ + """ + if self.flatten_output: + return self.num_dims * self.embedding_size + else: + return self.num_dims + + def forward(self, X: Dict) -> Tensor: + """Produce embedding for each value in input. + + Args: + X : Dict + + Returns: + torch.Tensor + + """ + X = X["cont"] + x = self.layers(X) + if self.flatten_output: + return x.view(x.shape[0], -1) + return x + + +class PLREmbeddingFlat(PLREmbedding): + """Flatten version of BasicCatEmbedding.""" + + def __init__(self, *args, **kwargs): + super(PLREmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + + +class SoftEmbedding(torch.nn.Module): + """ + Soft-one hot encoding embedding technique, from https://arxiv.org/pdf/1708.00065.pdf + In a nutshell, it represents a continuous feature as a weighted average of embeddings + """ + + def __init__(self, num_dims, embedding_size=10, flatten_output: bool = False, **kwargs) -> None: + """ + + Parameters + ---------- + num_embeddings: Number of embeddings to use (cardinality of the embedding table). + embeddings_dim: The dimension of the vector space for projecting the scalar value. + embeddings_init_std: The standard deviation factor for normal initialization of the + embedding matrix weights. + emb_initializer: Dict where keys are feature names and values are callable to initialize + embedding tables + """ + super(SoftEmbedding, self).__init__() + self.embedding_table = torch.nn.Embedding(num_dims, embedding_size) + nn.init.xavier_uniform_(self.embedding_table.weight) + + self.projection_layer = torch.nn.Linear(1, num_dims, bias=True) + self.softmax = torch.nn.Softmax(dim=-1) + self.emb_size = embedding_size + self.num_dims = num_dims + self.flatten_output = flatten_output + + def get_out_shape(self) -> int: + """Output shape. + + Returns: + int with module output shape. 
+ + """ + if self.flatten_output: + return self.num_dims * self.emb_size + else: + return self.num_dims + + def forward(self, X: Dict) -> Tensor: + """Produce embedding for each value in input. + + Args: + X : Dict + + Returns: + torch.Tensor + + """ + X = X["cont"] + input_numeric = X.unsqueeze(-1) + weights = self.softmax(self.projection_layer(input_numeric)) + x = (weights.unsqueeze(-1) * self.embedding_table.weight).sum(-2) + if self.flatten_output: + return x.view(x.shape[0], -1) + return x + + +class SoftEmbeddingFlat(SoftEmbedding): + """Flatten version of BasicCatEmbedding.""" + + def __init__(self, *args, **kwargs): + super(SoftEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) From f58b4c5f49e5c3206d0f3eeb017dc773b9d723e6 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 28 Aug 2023 08:39:05 +0000 Subject: [PATCH 21/49] no-changes --- lightautoml/automl/presets/tabular_presets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lightautoml/automl/presets/tabular_presets.py b/lightautoml/automl/presets/tabular_presets.py index cac85a0e..ff97e075 100755 --- a/lightautoml/automl/presets/tabular_presets.py +++ b/lightautoml/automl/presets/tabular_presets.py @@ -555,7 +555,6 @@ def create_automl(self, **fit_args): self.infer_auto_params(train_data, multilevel_avail) reader = PandasToPandasReader(task=self.task, **self.reader_params) pre_selector = self.get_selector() - levels = [] for n, names in enumerate(self.general_params["use_algos"]): From a50a90fef22a8bf8f7f01b2a210a737c7d45c8d1 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 29 Aug 2023 11:55:11 +0000 Subject: [PATCH 22/49] beautiful CV tutorial --- examples/tutorials/Tutorial_8_CV_preset.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tutorials/Tutorial_8_CV_preset.ipynb b/examples/tutorials/Tutorial_8_CV_preset.ipynb index 18f7c68a..e23668a5 100644 --- a/examples/tutorials/Tutorial_8_CV_preset.ipynb +++ 
b/examples/tutorials/Tutorial_8_CV_preset.ipynb @@ -3060,7 +3060,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "No we can choose another model from timm. So we will use resnet50.a1_in1k, by default it uses vit_base_patch16_224.augreg_in21k" + "### Now we can choose another model from ```timm```. So we will use ```tf_efficientnetv2_b0.in1k```, by default it uses ```vit_base_patch16_224.augreg_in21k```" ] }, { From c66b3c267e659e310aa57b9757ddb2becefcc482 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 29 Aug 2023 12:37:26 +0000 Subject: [PATCH 23/49] added some changes on comments --- lightautoml/automl/presets/image_config.yml | 6 +++--- lightautoml/automl/presets/tabular_config.yml | 4 ++-- .../tabular_configs/conf_0_sel_type_0.yml | 14 +++++++++---- .../tabular_configs/conf_1_sel_type_1.yml | 12 ++++++++--- .../conf_2_select_mode_1_no_typ.yml | 12 ++++++++--- .../conf_3_sel_type_1_no_inter_lgbm.yml | 12 ++++++++--- .../conf_4_sel_type_0_no_int.yml | 12 ++++++++--- .../conf_5_sel_type_1_tuning_full.yml | 12 ++++++++--- ...f_6_sel_type_1_tuning_full_no_int_lgbm.yml | 12 ++++++++--- lightautoml/ml_algo/dl_model.py | 20 ++++--------------- pyproject.toml | 2 +- 11 files changed, 74 insertions(+), 44 deletions(-) diff --git a/lightautoml/automl/presets/image_config.yml b/lightautoml/automl/presets/image_config.yml index 2ba5b4c7..fa206e3c 100755 --- a/lightautoml/automl/presets/image_config.yml +++ b/lightautoml/automl/presets/image_config.yml @@ -240,7 +240,7 @@ nn_pipeline_params: nn_params: # Look for NN train params here. 
- # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight # embedding_size if needed embedding_size: 10 @@ -266,7 +266,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -274,7 +274,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/tabular_config.yml b/lightautoml/automl/presets/tabular_config.yml index bcf6abbd..d391d5e8 100755 --- a/lightautoml/automl/presets/tabular_config.yml +++ b/lightautoml/automl/presets/tabular_config.yml @@ -126,7 +126,7 @@ linear_l2_params: # params for NN model nn_params: # Look for NN train params here. 
- # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight # embedding_size if needed embedding_size: 10 @@ -160,7 +160,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 256, 128] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/tabular_configs/conf_0_sel_type_0.yml b/lightautoml/automl/presets/tabular_configs/conf_0_sel_type_0.yml index dad3f314..95494f9b 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_0_sel_type_0.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_0_sel_type_0.yml @@ -98,9 +98,15 @@ linear_l2_params: # params for NN model nn_params: - # Look for NN train params here. - # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # Look for NN train params here. 
+ # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network @@ -119,7 +125,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -127,7 +133,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/tabular_configs/conf_1_sel_type_1.yml b/lightautoml/automl/presets/tabular_configs/conf_1_sel_type_1.yml index 6841b11e..bf202cd6 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_1_sel_type_1.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_1_sel_type_1.yml @@ -100,8 +100,14 @@ linear_l2_params: # params for NN model nn_params: # Look for NN train params here. 
- # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network @@ -120,7 +126,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -128,7 +134,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/tabular_configs/conf_2_select_mode_1_no_typ.yml b/lightautoml/automl/presets/tabular_configs/conf_2_select_mode_1_no_typ.yml index cff3a6cd..eeaa2535 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_2_select_mode_1_no_typ.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_2_select_mode_1_no_typ.yml @@ -100,8 +100,14 @@ linear_l2_params: # params for NN model nn_params: # Look for NN train params here. 
- # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network @@ -120,7 +126,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -128,7 +134,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/tabular_configs/conf_3_sel_type_1_no_inter_lgbm.yml b/lightautoml/automl/presets/tabular_configs/conf_3_sel_type_1_no_inter_lgbm.yml index 4af9e989..318af04b 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_3_sel_type_1_no_inter_lgbm.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_3_sel_type_1_no_inter_lgbm.yml @@ -100,8 +100,14 @@ linear_l2_params: # params for NN model nn_params: # Look for NN train params here. 
- # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network @@ -120,7 +126,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -128,7 +134,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/tabular_configs/conf_4_sel_type_0_no_int.yml b/lightautoml/automl/presets/tabular_configs/conf_4_sel_type_0_no_int.yml index 07ba1a91..6cc47500 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_4_sel_type_0_no_int.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_4_sel_type_0_no_int.yml @@ -100,8 +100,14 @@ linear_l2_params: # params for NN model nn_params: # Look for NN train params here. 
- # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network @@ -120,7 +126,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -128,7 +134,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/tabular_configs/conf_5_sel_type_1_tuning_full.yml b/lightautoml/automl/presets/tabular_configs/conf_5_sel_type_1_tuning_full.yml index f0175669..2c7b65fd 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_5_sel_type_1_tuning_full.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_5_sel_type_1_tuning_full.yml @@ -100,8 +100,14 @@ linear_l2_params: # params for NN model nn_params: # Look for NN train params here. 
- # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network @@ -120,7 +126,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -128,7 +134,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/automl/presets/tabular_configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml b/lightautoml/automl/presets/tabular_configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml index 4af9e989..318af04b 100644 --- a/lightautoml/automl/presets/tabular_configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml +++ b/lightautoml/automl/presets/tabular_configs/conf_6_sel_type_1_tuning_full_no_int_lgbm.yml @@ -100,8 +100,14 @@ linear_l2_params: # params for NN model nn_params: # Look for NN train params here. 
- # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn'] or custom torch model + # str in ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint'] or custom torch model model: denselight + # embedding_size if needed + embedding_size: 10 + # str in ['cat', 'cat_no_dropout', 'weighted'] + cat_embedder: "cat" + # str in ['cont', 'linear', 'dense'] + cont_embedder: "cont" # use model with custom embeddings model_with_emb: false # tune custom network @@ -120,7 +126,7 @@ nn_params: # add fc layer before model with certain dim num_init_features: null # activation function (str in torch.nn activation functions or custom nn.Module) - act_fun: ReLU + act_fun: LeakyReLU # add noise after dropout layer for more regularization use_noise: false # noise parameter @@ -128,7 +134,7 @@ nn_params: # use BatchNorm use_bn: true # define hidden layer dimensions for models in ['mlp', 'denselight', 'snn'] - hidden_size: [512, 512, 512] + hidden_size: [512, 256] # dim of intermediate fc is increased times this factor in ResnetModel layer hid_factor: [2, 2] # list of number of layers within each DenseModel block diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 8e6de1b9..22780343 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -113,31 +113,19 @@ def _get_embedder_cat(params): if input_type_by_name[params["model"]] == "seq": - try: - out = cat_embedder_by_name[params["cat_embedder"]] - except KeyError: - out = BasicCatEmbedding + out = cat_embedder_by_name.get(params["cat_embedder"], BasicCatEmbedding) return out else: - try: - out = cat_embedder_by_name_flat[params["cat_embedder"]] - except KeyError: - out = CatEmbedder + out = cat_embedder_by_name_flat.get(params["cat_embedder"], CatEmbedder) return out def _get_embedder_cont(params): if input_type_by_name[params["model"]] == "seq": - try: - out = cont_embedder_by_name[params["cont_embedder"]] - except KeyError: - out = LinearEmbedding + out = 
cont_embedder_by_name.get(params["cat_embedder"], LinearEmbedding) return out else: - try: - out = cont_embedder_by_name_flat[params["cont_embedder"]] - except KeyError: - out = ContEmbedder + out = cont_embedder_by_name_flat.get(params["cat_embedder"], ContEmbedder) return out diff --git a/pyproject.toml b/pyproject.toml index 352a0cad..85473d4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ transformers = {version = ">=4", optional = true} # CV albumentations = {version = "<=1.0.3", optional = true} -timm = {version = "*", optional = true} +timm = {version = ">=0.9.0", optional = true} opencv-python = {version = "<=4.8.0.74", optional = true} PyWavelets = {version = "*", optional = true} torchvision = [ From d045b8e45b7507f0dff042c8232ffa1d8680b6bc Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 29 Aug 2023 13:46:40 +0000 Subject: [PATCH 24/49] removed useless function --- lightautoml/ml_algo/dl_model.py | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 22780343..844a983c 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -111,24 +111,6 @@ cont_embedder_by_name = {"linear": LinearEmbedding, "dense": DenseEmbedding} -def _get_embedder_cat(params): - if input_type_by_name[params["model"]] == "seq": - out = cat_embedder_by_name.get(params["cat_embedder"], BasicCatEmbedding) - return out - else: - out = cat_embedder_by_name_flat.get(params["cat_embedder"], CatEmbedder) - return out - - -def _get_embedder_cont(params): - if input_type_by_name[params["model"]] == "seq": - out = cont_embedder_by_name.get(params["cat_embedder"], LinearEmbedding) - return out - else: - out = cont_embedder_by_name_flat.get(params["cat_embedder"], ContEmbedder) - return out - - class TorchModel(TabularMLAlgo): """Neural net for tabular datasets. 
@@ -300,7 +282,11 @@ def _infer_params(self): net=TorchUniversalModel if not params["model_with_emb"] else params["model"], net_params={ "task": self.task, - "cont_embedder_": _get_embedder_cont(params) if is_cont else None, + "cont_embedder_": cont_embedder_by_name.get(params["cont_embedder"], LinearEmbedding) + if input_type_by_name[params["model"]] == "seq" + else cont_embedder_by_name_flat.get(params["cont_embedder"], ContEmbedder) + if is_cont + else None, "cont_params": { "num_dims": params["num_dims"], "input_bn": params["input_bn"], @@ -309,7 +295,11 @@ def _infer_params(self): } if is_cont else None, - "cat_embedder_": _get_embedder_cat(params) if is_cat else None, + "cat_embedder_": cat_embedder_by_name.get(params["cat_embedder"], BasicCatEmbedding) + if input_type_by_name[params["model"]] == "seq" + else cat_embedder_by_name_flat.get(params["cat_embedder"], CatEmbedder) + if is_cat + else None, "cat_params": { "cat_vc": params["cat_vc"], "cat_dims": params["cat_dims"], From 382b385a0e66292625d39aed901852ad0c9a45a9 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 29 Aug 2023 15:59:21 +0000 Subject: [PATCH 25/49] removed for-for --- lightautoml/ml_algo/dl_model.py | 5 ++- lightautoml/text/embed.py | 69 +++++++++++++++++---------------- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 844a983c..bc28fa77 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -405,7 +405,10 @@ def _init_params_on_input(self, train_valid_iterator) -> dict: ) + 1 ) - values, counts = np.unique(train_valid_iterator.train[:, cat_feature].data, return_counts=True) + values, counts = np.unique( + np.concatenate([train_valid_iterator.train[:, cat_feature].data, valid[:, cat_feature].data]), + return_counts=True, + ) cat_value_counts.append(dict(zip(values, counts))) cat_dims.append(num_unique_categories) new_params["cat_dims"] = cat_dims diff --git 
a/lightautoml/text/embed.py b/lightautoml/text/embed.py index 650a86e6..54a6ee64 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -266,6 +266,7 @@ class WeightedCatEmbedding(nn.Module): def __init__( self, + cat_dims: Sequence[int], cat_vc: Sequence[Dict], embedding_size: int = 10, alpha: int = 20, @@ -284,27 +285,29 @@ def __init__( self.num_values = 0 self.embedding: Optional[nn.Embedding] = None self.embedding_size = embedding_size - self._from_summary(cat_vc) + self._from_summary(cat_vc, cat_dims) self.cat_len = len(cat_vc) + self.cat_dims = cat_dims - def _from_summary(self, unique_counts: List[Dict[Any, int]]): - lookup = {} - lookup_default = {} - num_values = 0 + def _from_summary(self, unique_counts: List[Dict[Any, int]], cat_dims: Sequence[int]): + self.emb_layers = nn.ModuleList([nn.Embedding(int(x), self.embedding_size) for x in cat_dims]) + self.def_layers = nn.ModuleList([nn.Embedding(1, 1) for _ in cat_dims]) + weights_list = [] for fieldnum, counts in enumerate(unique_counts): - lookup_default[fieldnum] = (num_values, 0) - num_values += 1 - for value, count in counts.items(): - lookup[(fieldnum, value)] = (num_values, count) - num_values += 1 - + weights = [] + for i, vc in enumerate(sorted(counts.items())): + value, count = vc + if i == 0 and value != 0.0: + weights.append([0]) + weights.append([count / (count + self.alpha)]) + weights_list.append(weights) + self.w_emb_layers = nn.ModuleList( + [nn.Embedding.from_pretrained(torch.tensor(x, dtype=torch.float32)) for x in weights_list] + ) self.num_fields = len(unique_counts) self.output_size = self.num_fields * self.embedding_size - self.lookup = lookup - self.lookup_default = lookup_default - self.num_values = num_values - self.embedding = nn.Embedding(num_values, self.embedding_size) - nn.init.xavier_uniform_(self.embedding.weight) + for emb in self.emb_layers: + nn.init.xavier_uniform_(emb.weight) def get_out_shape(self) -> int: """Output shape. 
@@ -328,23 +331,23 @@ def forward(self, X: Dict) -> Tensor: torch.Tensor """ X = X["cat"] - list_weights: List[List[List[float]]] = [] - idxs_primary: List[List[int]] = [] - idxs_default: List[List[int]] = [] - for row in X: - list_weights.append([]) - idxs_primary.append([]) - idxs_default.append([]) - for col, val in enumerate(row): - val = val.item() - default = self.lookup_default[col] - idx, count = self.lookup.get((col, val), default) - list_weights[-1].append([count / (count + self.alpha)]) - idxs_primary[-1].append(idx) - idxs_default[-1].append(default[0]) - tsr_weights = torch.tensor(list_weights, dtype=torch.float32, device=self._device) - emb_primary = self.embedding(torch.tensor(idxs_primary, dtype=torch.int64, device=self._device)) - emb_default = self.embedding(torch.tensor(idxs_default, dtype=torch.int64, device=self._device)) + emb_primary = torch.stack( + [emb_layer(X[:, i]) for i, emb_layer in enumerate(self.emb_layers)], + dim=1, + ) + tsr_weights = torch.stack( + [emb_layer(X[:, i]) for i, emb_layer in enumerate(self.w_emb_layers)], + dim=1, + ) + + emb_default = torch.stack( + [ + emb_layer(torch.tensor([0] * len(X[:, i]), device=self._device)) + for i, emb_layer in enumerate(self.def_layers) + ], + dim=1, + ) + x = tsr_weights * emb_primary + (1 - tsr_weights) * emb_default if self.flatten_output: return x.view(x.shape[0], -1) From 4955b2eeff7d53a915d4dfccf2439526af61fa0d Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 30 Aug 2023 09:40:14 +0000 Subject: [PATCH 26/49] WeightedEmbedder bugfix --- lightautoml/text/embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightautoml/text/embed.py b/lightautoml/text/embed.py index 54a6ee64..dcb63b6e 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -291,7 +291,7 @@ def __init__( def _from_summary(self, unique_counts: List[Dict[Any, int]], cat_dims: Sequence[int]): self.emb_layers = nn.ModuleList([nn.Embedding(int(x), self.embedding_size) for x 
in cat_dims]) - self.def_layers = nn.ModuleList([nn.Embedding(1, 1) for _ in cat_dims]) + self.def_layers = nn.ModuleList([nn.Embedding(1, self.embedding_size) for _ in cat_dims]) weights_list = [] for fieldnum, counts in enumerate(unique_counts): weights = [] From 1bdf9d52934ede8496fba3bb99c4108164ea08da Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 30 Aug 2023 09:45:07 +0000 Subject: [PATCH 27/49] delete unused import --- lightautoml/ml_algo/torch_based/nn_models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 47f7f43d..fe0c6575 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -13,8 +13,6 @@ from lightautoml.ml_algo.torch_based.node_nn_model import DenseODSTBlock, MeanPooling -from lightautoml.ml_algo.torch_based.node_nn_model import DenseODSTBlock, MeanPooling - class GaussianNoise(nn.Module): """Adds gaussian noise. From b51e4deb1f9a36f53e5e096dd777aade092b95a3 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 30 Aug 2023 10:03:54 +0000 Subject: [PATCH 28/49] changed link --- lightautoml/ml_algo/torch_based/nn_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index fe0c6575..30676df8 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -857,7 +857,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class AutoInt(nn.Module): - """The NODE model from https://github.com/Qwicen. + """The AutoInt model from https://github.com/jrfiedler/xynn. Args: n_in: Input dim. 
From 6d4a74ea4e1e8fa16a0581c02bdfd1ebe11848d6 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 30 Aug 2023 13:46:47 +0000 Subject: [PATCH 29/49] add tabnet/plr/softemb --- lightautoml/ml_algo/tabnet/utils.py | 166 ++-- lightautoml/ml_algo/torch_based/nn_models.py | 45 +- .../pytorch_tabnet/abstract_model.py | 801 --------------- .../pytorch_tabnet/augmentations.py | 93 -- .../torch_based/pytorch_tabnet/callbacks.py | 287 ------ .../torch_based/pytorch_tabnet/metrics.py | 515 ---------- .../pytorch_tabnet/multiclass_utils.py | 402 -------- .../torch_based/pytorch_tabnet/multitask.py | 167 ---- .../torch_based/pytorch_tabnet/pretraining.py | 418 -------- .../pytorch_tabnet/pretraining_utils.py | 119 --- .../torch_based/pytorch_tabnet/sparsemax.py | 276 ------ .../torch_based/pytorch_tabnet/tab_model.py | 146 --- .../torch_based/pytorch_tabnet/tab_network.py | 908 ------------------ .../torch_based/pytorch_tabnet/utils.py | 529 ---------- lightautoml/text/embed.py | 17 +- 15 files changed, 141 insertions(+), 4748 deletions(-) delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py delete mode 100755 lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py delete mode 100644 lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py delete mode 100644 
lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py diff --git a/lightautoml/ml_algo/tabnet/utils.py b/lightautoml/ml_algo/tabnet/utils.py index 40845a8b..a901b7c6 100644 --- a/lightautoml/ml_algo/tabnet/utils.py +++ b/lightautoml/ml_algo/tabnet/utils.py @@ -6,14 +6,14 @@ from lightautoml.ml_algo.torch_based.autoint.ghost_norm import GhostBatchNorm -def initialize_non_glu(module, input_dim, output_dim): +def _initialize_non_glu(module, input_dim, output_dim): gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(4 * input_dim)) torch.nn.init.xavier_normal_(module.weight, gain=gain_value) # torch.nn.init.zeros_(module.bias) return -def initialize_glu(module, input_dim, output_dim): +def _initialize_glu(module, input_dim, output_dim): gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(input_dim)) torch.nn.init.xavier_normal_(module.weight, gain=gain_value) # torch.nn.init.zeros_(module.bias) @@ -21,27 +21,11 @@ def initialize_glu(module, input_dim, output_dim): class TabNetEncoder(torch.nn.Module): - def __init__( - self, - input_dim, - output_dim, - n_d=8, - n_a=8, - n_steps=3, - gamma=1.3, - n_independent=2, - n_shared=2, - epsilon=1e-15, - virtual_batch_size=128, - momentum=0.02, - mask_type="sparsemax", - group_attention_matrix=None, - ): - """ - Defines main part of the TabNet network without the embedding layers. + """Defines main part of the TabNet network without the embedding layers. 
- Parameters - ---------- + Code from https://github.com/dreamquark-ai/tabnet + + Args: input_dim : int Number of features output_dim : int or list of int for multi task classification @@ -69,7 +53,24 @@ def __init__( Either "sparsemax" or "entmax" : this is the masking function to use group_attention_matrix : torch matrix Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j - """ + """ + + def __init__( + self, + input_dim, + output_dim, + n_d=8, + n_a=8, + n_steps=3, + gamma=1.3, + n_independent=2, + n_shared=2, + epsilon=1e-15, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + group_attention_matrix=None, + ): super(TabNetEncoder, self).__init__() self.input_dim = input_dim self.output_dim = output_dim @@ -137,6 +138,15 @@ def __init__( self.att_transformers.append(attention) def forward(self, x, prior=None): + """Forward-pass of encoder. + + Args: + x : input Tensor + prior : mask for AttentiveTransformer + + Returns: + sequence of outputs, regulariztion loss + """ x = self.initial_bn(x) bs = x.shape[0] # batch size @@ -164,6 +174,14 @@ def forward(self, x, prior=None): return steps_output, M_loss def forward_masks(self, x): + """Magic forward-pass of encoder that returns masks. + + Args: + x : input Tensor + + Returns: + new and old masks. + """ x = self.initial_bn(x) bs = x.shape[0] # batch size prior = torch.ones((bs, self.attention_dim)).to(x.device) @@ -191,21 +209,9 @@ def forward_masks(self, x): class FeatTransformer(torch.nn.Module): - def __init__( - self, - input_dim, - output_dim, - shared_layers, - n_glu_independent, - virtual_batch_size=128, - momentum=0.02, - ): - super(FeatTransformer, self).__init__() - """ - Initialize a feature transformer. + """Feature transformer from https://github.com/dreamquark-ai/tabnet. 
- Parameters - ---------- + Args: input_dim : int Input size output_dim : int @@ -218,8 +224,18 @@ def __init__( Batch size for Ghost Batch Normalization within GLU block(s) momentum : float Float value between 0 and 1 which will be used for momentum in batch norm - """ + """ + def __init__( + self, + input_dim, + output_dim, + shared_layers, + n_glu_independent, + virtual_batch_size=128, + momentum=0.02, + ): + super(FeatTransformer, self).__init__() params = { "n_glu": n_glu_independent, "virtual_batch_size": virtual_batch_size, @@ -250,14 +266,32 @@ def __init__( self.specifics = GLU_Block(spec_input_dim, output_dim, first=is_first, **params) def forward(self, x): + """Forward-pass.""" x = self.shared(x) x = self.specifics(x) return x class GLU_Block(torch.nn.Module): - """ - Independent GLU block, specific to each step + """Independent GLU block, specific to each step. + + Code from https://github.com/dreamquark-ai/tabnet. + + Args: + input_dim : int + Input size + output_dim : int + Output_size + shared_layers : torch.nn.ModuleList + The shared block that should be common to every step + n_glu : int + Number of independent GLU layers + virtual_batch_size : int + Batch size for Ghost Batch Normalization within GLU block(s) + momentum : float + Float value between 0 and 1 which will be used for momentum in batch norm + first : bool + if the first layer of the block has no scale multiplication or not """ def __init__( @@ -285,6 +319,7 @@ def __init__( self.glu_layers.append(GLU_Layer(output_dim, output_dim, fc=fc, **params)) def forward(self, x): + """Forward-pass.""" scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device)) if self.first: # the first layer of the block has no scale multiplication x = self.glu_layers[0](x) @@ -299,6 +334,22 @@ def forward(self, x): class GLU_Layer(torch.nn.Module): + """GLU layer implementation. 
+ + Args: + input_dim : int + Input size + output_dim : int + Output_size + fc : torch.nn.Module + Optional fully-connected layer + virtual_batch_size : int + Batch size for Ghost Batch Normalization within GLU block(s) + momentum : float + Float value between 0 and 1 which will be used for momentum in batch norm + + """ + def __init__(self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02): super(GLU_Layer, self).__init__() @@ -307,11 +358,12 @@ def __init__(self, input_dim, output_dim, fc=None, virtual_batch_size=128, momen self.fc = fc else: self.fc = nn.Linear(input_dim, 2 * output_dim, bias=False) - initialize_glu(self.fc, input_dim, 2 * output_dim) + _initialize_glu(self.fc, input_dim, 2 * output_dim) self.bn = GhostBatchNorm(2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum) def forward(self, x): + """Forward-pass.""" x = self.fc(x) x = self.bn(x) out = torch.mul(x[:, : self.output_dim], torch.sigmoid(x[:, self.output_dim :])) @@ -319,20 +371,11 @@ def forward(self, x): class AttentiveTransformer(torch.nn.Module): - def __init__( - self, - input_dim, - group_dim, - group_matrix, - virtual_batch_size=128, - momentum=0.02, - mask_type="sparsemax", - ): - """ - Initialize an attention transformer. + """Attention transformer. + + Code from https://github.com/dreamquark-ai/tabnet. 
- Parameters - ---------- + Args: input_dim : int Input size group_dim : int @@ -343,10 +386,20 @@ def __init__( Float value between 0 and 1 which will be used for momentum in batch norm mask_type : str Either "sparsemax" or "entmax" : this is the masking function to use - """ + """ + + def __init__( + self, + input_dim, + group_dim, + group_matrix, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + ): super(AttentiveTransformer, self).__init__() self.fc = nn.Linear(input_dim, group_dim, bias=False) - initialize_non_glu(self.fc, input_dim, group_dim) + _initialize_non_glu(self.fc, input_dim, group_dim) self.bn = GhostBatchNorm(group_dim, virtual_batch_size=virtual_batch_size, momentum=momentum) if mask_type == "sparsemax": @@ -359,6 +412,7 @@ def __init__( raise NotImplementedError("Please choose either sparsemax" + "or entmax as masktype") def forward(self, priors, processed_feat): + """Forward-pass.""" x = self.fc(processed_feat) x = self.bn(x) x = torch.mul(x, priors) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 44b57b56..cee78575 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -979,28 +979,9 @@ def forward(self, embedded: torch.Tensor) -> torch.Tensor: class TabNet(torch.nn.Module): - def __init__( - self, - n_in, - n_out, - n_d=8, - n_a=8, - n_steps=3, - gamma=1.3, - n_independent=2, - n_shared=2, - epsilon=1e-15, - virtual_batch_size=128, - momentum=0.02, - mask_type="sparsemax", - group_attention_matrix=None, - **kwargs, - ): - """ - Defines main part of the TabNet network without the embedding layers. + """Implementation of TabNet from https://github.com/dreamquark-ai/tabnet. 
- Parameters - ---------- + Args: input_dim : int Number of features output_dim : int or list of int for multi task classification @@ -1028,7 +1009,25 @@ def __init__( Either "sparsemax" or "entmax" : this is the masking function to use group_attention_matrix : torch matrix Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j - """ + """ + + def __init__( + self, + n_in, + n_out, + n_d=8, + n_a=8, + n_steps=3, + gamma=1.3, + n_independent=2, + n_shared=2, + epsilon=1e-15, + virtual_batch_size=128, + momentum=0.02, + mask_type="sparsemax", + group_attention_matrix=None, + **kwargs, + ): super(TabNet, self).__init__() self.input_dim = n_in self.output_dim = n_out @@ -1071,6 +1070,7 @@ def __init__( initialize_non_glu(self.final_mapping, n_d, n_out) def forward(self, x): + """Forward-pass.""" res = 0 steps_output, M_loss = self.encoder(x) res = torch.sum(torch.stack(steps_output, dim=0), dim=0) @@ -1085,4 +1085,5 @@ def forward(self, x): return out def forward_masks(self, x): + """Magic forward-pass of encoder that returns masks.""" return self.encoder.forward_masks(x) diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py deleted file mode 100644 index 76c4de53..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/abstract_model.py +++ /dev/null @@ -1,801 +0,0 @@ -from dataclasses import dataclass, field -from typing import List, Any, Dict -import torch -from torch.nn.utils import clip_grad_norm_ -import numpy as np -from scipy.sparse import csc_matrix -from abc import abstractmethod -from pytorch_tabnet import tab_network -from pytorch_tabnet.utils import ( - SparsePredictDataset, - PredictDataset, - create_explain_matrix, - validate_eval_set, - create_dataloaders, - define_device, - ComplexEncoder, - check_input, - check_warm_start, - create_group_matrix, - check_embedding_parameters, -) -from pytorch_tabnet.callbacks import ( - 
CallbackContainer, - History, - EarlyStopping, - LRSchedulerCallback, -) -from pytorch_tabnet.metrics import MetricContainer, check_metrics -from sklearn.base import BaseEstimator - -from torch.utils.data import DataLoader -import io -import json -from pathlib import Path -import shutil -import zipfile -import warnings -import copy -import scipy - - -@dataclass -class TabModel(BaseEstimator): - """ Class for TabNet model.""" - - n_d: int = 8 - n_a: int = 8 - n_steps: int = 3 - gamma: float = 1.3 - cat_idxs: List[int] = field(default_factory=list) - cat_dims: List[int] = field(default_factory=list) - cat_emb_dim: int = 1 - n_independent: int = 2 - n_shared: int = 2 - epsilon: float = 1e-15 - momentum: float = 0.02 - lambda_sparse: float = 1e-3 - seed: int = 0 - clip_value: int = 1 - verbose: int = 1 - optimizer_fn: Any = torch.optim.Adam - optimizer_params: Dict = field(default_factory=lambda: dict(lr=2e-2)) - scheduler_fn: Any = None - scheduler_params: Dict = field(default_factory=dict) - mask_type: str = "sparsemax" - input_dim: int = None - output_dim: int = None - device_name: str = "auto" - n_shared_decoder: int = 1 - n_indep_decoder: int = 1 - grouped_features: List[List[int]] = field(default_factory=list) - - def __post_init__(self): - # These are default values needed for saving model - self.batch_size = 1024 - self.virtual_batch_size = 128 - - torch.manual_seed(self.seed) - # Defining device - self.device = torch.device(define_device(self.device_name)) - if self.verbose != 0: - warnings.warn(f"Device used : {self.device}") - - # create deep copies of mutable parameters - self.optimizer_fn = copy.deepcopy(self.optimizer_fn) - self.scheduler_fn = copy.deepcopy(self.scheduler_fn) - - updated_params = check_embedding_parameters(self.cat_dims, self.cat_idxs, self.cat_emb_dim) - self.cat_dims, self.cat_idxs, self.cat_emb_dim = updated_params - - def __update__(self, **kwargs): - """ - Updates parameters. - If does not already exists, creates it. 
- Otherwise overwrite with warnings. - """ - update_list = [ - "cat_dims", - "cat_emb_dim", - "cat_idxs", - "input_dim", - "mask_type", - "n_a", - "n_d", - "n_independent", - "n_shared", - "n_steps", - "grouped_features", - ] - for var_name, value in kwargs.items(): - if var_name in update_list: - try: - exec(f"global previous_val; previous_val = self.{var_name}") - if previous_val != value: # noqa - wrn_msg = f"Pretraining: {var_name} changed from {previous_val} to {value}" # noqa - warnings.warn(wrn_msg) - exec(f"self.{var_name} = value") - except AttributeError: - exec(f"self.{var_name} = value") - - def fit( - self, - X_train, - y_train, - eval_set=None, - eval_name=None, - eval_metric=None, - loss_fn=None, - weights=0, - max_epochs=100, - patience=10, - batch_size=1024, - virtual_batch_size=128, - num_workers=0, - drop_last=True, - callbacks=None, - pin_memory=True, - from_unsupervised=None, - warm_start=False, - augmentations=None, - compute_importance=True, - ): - """Train a neural network stored in self.network - Using train_dataloader for training data and - valid_dataloader for validation. - - Parameters - ---------- - X_train : np.ndarray - Train set - y_train : np.array - Train targets - eval_set : list of tuple - List of eval tuple set (X, y). - The last one is used for early stopping - eval_name : list of str - List of eval set names. - eval_metric : list of str - List of evaluation metrics. - The last metric is used for early stopping. 
- loss_fn : callable or None - a PyTorch loss function - weights : bool or dictionnary - 0 for no balancing - 1 for automated balancing - dict for custom weights per class - max_epochs : int - Maximum number of epochs during training - patience : int - Number of consecutive non improving epoch before early stopping - batch_size : int - Training batch size - virtual_batch_size : int - Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size) - num_workers : int - Number of workers used in torch.utils.data.DataLoader - drop_last : bool - Whether to drop last batch during training - callbacks : list of callback function - List of custom callbacks - pin_memory: bool - Whether to set pin_memory to True or False during training - from_unsupervised: unsupervised trained model - Use a previously self supervised model as starting weights - warm_start: bool - If True, current model parameters are used to start training - compute_importance : bool - Whether to compute feature importance - """ - # update model name - - self.max_epochs = max_epochs - self.patience = patience - self.batch_size = batch_size - self.virtual_batch_size = virtual_batch_size - self.num_workers = num_workers - self.drop_last = drop_last - self.input_dim = X_train.shape[1] - self._stop_training = False - self.pin_memory = pin_memory and (self.device.type != "cpu") - self.augmentations = augmentations - self.compute_importance = compute_importance - - if self.augmentations is not None: - # This ensure reproducibility - self.augmentations._set_seed() - - eval_set = eval_set if eval_set else [] - - if loss_fn is None: - self.loss_fn = self._default_loss - else: - self.loss_fn = loss_fn - - check_input(X_train) - check_warm_start(warm_start, from_unsupervised) - - self.update_fit_params( - X_train, - y_train, - eval_set, - weights, - ) - - # Validate and reformat eval set depending on training data - eval_names, eval_set = validate_eval_set(eval_set, eval_name, X_train, y_train) - - 
train_dataloader, valid_dataloaders = self._construct_loaders(X_train, y_train, eval_set) - - if from_unsupervised is not None: - # Update parameters to match self pretraining - self.__update__(**from_unsupervised.get_params()) - - if not hasattr(self, "network") or not warm_start: - # model has never been fitted before of warm_start is False - self._set_network() - self._update_network_params() - self._set_metrics(eval_metric, eval_names) - self._set_optimizer() - self._set_callbacks(callbacks) - - if from_unsupervised is not None: - self.load_weights_from_unsupervised(from_unsupervised) - warnings.warn("Loading weights from unsupervised pretraining") - # Call method on_train_begin for all callbacks - self._callback_container.on_train_begin() - - # Training loop over epochs - for epoch_idx in range(self.max_epochs): - - # Call method on_epoch_begin for all callbacks - self._callback_container.on_epoch_begin(epoch_idx) - - self._train_epoch(train_dataloader) - - # Apply predict epoch to all eval sets - for eval_name, valid_dataloader in zip(eval_names, valid_dataloaders): - self._predict_epoch(eval_name, valid_dataloader) - - # Call method on_epoch_end for all callbacks - self._callback_container.on_epoch_end(epoch_idx, logs=self.history.epoch_metrics) - - if self._stop_training: - break - - # Call method on_train_end for all callbacks - self._callback_container.on_train_end() - self.network.eval() - - if self.compute_importance: - # compute feature importance once the best model is defined - self.feature_importances_ = self._compute_feature_importances(X_train) - - def predict(self, X): - """ - Make predictions on a batch (valid) - - Parameters - ---------- - X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` - Input data - - Returns - ------- - predictions : np.array - Predictions of the regression problem - """ - self.network.eval() - - if scipy.sparse.issparse(X): - dataloader = DataLoader( - SparsePredictDataset(X), - 
batch_size=self.batch_size, - shuffle=False, - ) - else: - dataloader = DataLoader( - PredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - - results = [] - for batch_nb, data in enumerate(dataloader): - data = data.to(self.device).float() - output, M_loss = self.network(data) - predictions = output.cpu().detach().numpy() - results.append(predictions) - res = np.vstack(results) - return self.predict_func(res) - - def explain(self, X, normalize=False): - """ - Return local explanation - - Parameters - ---------- - X : tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` - Input data - normalize : bool (default False) - Wheter to normalize so that sum of features are equal to 1 - - Returns - ------- - M_explain : matrix - Importance per sample, per columns. - masks : matrix - Sparse matrix showing attention masks used by network. - """ - self.network.eval() - - if scipy.sparse.issparse(X): - dataloader = DataLoader( - SparsePredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - else: - dataloader = DataLoader( - PredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - - res_explain = [] - - for batch_nb, data in enumerate(dataloader): - data = data.to(self.device).float() - - M_explain, masks = self.network.forward_masks(data) - for key, value in masks.items(): - masks[key] = csc_matrix.dot(value.cpu().detach().numpy(), self.reducing_matrix) - original_feat_explain = csc_matrix.dot(M_explain.cpu().detach().numpy(), self.reducing_matrix) - res_explain.append(original_feat_explain) - - if batch_nb == 0: - res_masks = masks - else: - for key, value in masks.items(): - res_masks[key] = np.vstack([res_masks[key], value]) - - res_explain = np.vstack(res_explain) - - if normalize: - res_explain /= np.sum(res_explain, axis=1)[:, None] - - return res_explain, res_masks - - def load_weights_from_unsupervised(self, unsupervised_model): - update_state_dict = copy.deepcopy(self.network.state_dict()) - for param, weights in 
unsupervised_model.network.state_dict().items(): - if param.startswith("encoder"): - # Convert encoder's layers name to match - new_param = "tabnet." + param - else: - new_param = param - if self.network.state_dict().get(new_param) is not None: - # update only common layers - update_state_dict[new_param] = weights - - self.network.load_state_dict(update_state_dict) - - def load_class_attrs(self, class_attrs): - for attr_name, attr_value in class_attrs.items(): - setattr(self, attr_name, attr_value) - - def save_model(self, path): - """Saving TabNet model in two distinct files. - - Parameters - ---------- - path : str - Path of the model. - - Returns - ------- - str - input filepath with ".zip" appended - - """ - saved_params = {} - init_params = {} - for key, val in self.get_params().items(): - if isinstance(val, type): - # Don't save torch specific params - continue - else: - init_params[key] = val - saved_params["init_params"] = init_params - - class_attrs = {"preds_mapper": self.preds_mapper} - saved_params["class_attrs"] = class_attrs - - # Create folder - Path(path).mkdir(parents=True, exist_ok=True) - - # Save models params - with open(Path(path).joinpath("model_params.json"), "w", encoding="utf8") as f: - json.dump(saved_params, f, cls=ComplexEncoder) - - # Save state_dict - torch.save(self.network.state_dict(), Path(path).joinpath("network.pt")) - shutil.make_archive(path, "zip", path) - shutil.rmtree(path) - print(f"Successfully saved model at {path}.zip") - return f"{path}.zip" - - def load_model(self, filepath): - """Load TabNet model. - - Parameters - ---------- - filepath : str - Path of the model. 
- """ - try: - with zipfile.ZipFile(filepath) as z: - with z.open("model_params.json") as f: - loaded_params = json.load(f) - loaded_params["init_params"]["device_name"] = self.device_name - with z.open("network.pt") as f: - try: - saved_state_dict = torch.load(f, map_location=self.device) - except io.UnsupportedOperation: - # In Python <3.7, the returned file object is not seekable (which at least - # some versions of PyTorch require) - so we'll try buffering it in to a - # BytesIO instead: - saved_state_dict = torch.load( - io.BytesIO(f.read()), - map_location=self.device, - ) - except KeyError: - raise KeyError("Your zip file is missing at least one component") - - self.__init__(**loaded_params["init_params"]) - - self._set_network() - self.network.load_state_dict(saved_state_dict) - self.network.eval() - self.load_class_attrs(loaded_params["class_attrs"]) - - return - - def _train_epoch(self, train_loader): - """ - Trains one epoch of the network in self.network - - Parameters - ---------- - train_loader : a :class: `torch.utils.data.Dataloader` - DataLoader with train set - """ - self.network.train() - - for batch_idx, (X, y) in enumerate(train_loader): - self._callback_container.on_batch_begin(batch_idx) - - batch_logs = self._train_batch(X, y) - - self._callback_container.on_batch_end(batch_idx, batch_logs) - - epoch_logs = {"lr": self._optimizer.param_groups[-1]["lr"]} - self.history.epoch_metrics.update(epoch_logs) - - return - - def _train_batch(self, X, y): - """ - Trains one batch of data - - Parameters - ---------- - X : torch.Tensor - Train matrix - y : torch.Tensor - Target matrix - - Returns - ------- - batch_outs : dict - Dictionnary with "y": target and "score": prediction scores. - batch_logs : dict - Dictionnary with "batch_size" and "loss". 
- """ - batch_logs = {"batch_size": X.shape[0]} - - X = X.to(self.device).float() - y = y.to(self.device).float() - - if self.augmentations is not None: - X, y = self.augmentations(X, y) - - for param in self.network.parameters(): - param.grad = None - - output, M_loss = self.network(X) - - loss = self.compute_loss(output, y) - # Add the overall sparsity loss - loss = loss - self.lambda_sparse * M_loss - - # Perform backward pass and optimization - loss.backward() - if self.clip_value: - clip_grad_norm_(self.network.parameters(), self.clip_value) - self._optimizer.step() - - batch_logs["loss"] = loss.cpu().detach().numpy().item() - - return batch_logs - - def _predict_epoch(self, name, loader): - """ - Predict an epoch and update metrics. - - Parameters - ---------- - name : str - Name of the validation set - loader : torch.utils.data.Dataloader - DataLoader with validation set - """ - # Setting network on evaluation mode - self.network.eval() - - list_y_true = [] - list_y_score = [] - - # Main loop - for batch_idx, (X, y) in enumerate(loader): - scores = self._predict_batch(X) - list_y_true.append(y) - list_y_score.append(scores) - - y_true, scores = self.stack_batches(list_y_true, list_y_score) - - metrics_logs = self._metric_container_dict[name](y_true, scores) - self.network.train() - self.history.epoch_metrics.update(metrics_logs) - return - - def _predict_batch(self, X): - """ - Predict one batch of data. 
- - Parameters - ---------- - X : torch.Tensor - Owned products - - Returns - ------- - np.array - model scores - """ - X = X.to(self.device).float() - - # compute model output - scores, _ = self.network(X) - - if isinstance(scores, list): - scores = [x.cpu().detach().numpy() for x in scores] - else: - scores = scores.cpu().detach().numpy() - - return scores - - def _set_network(self): - """Setup the network and explain matrix.""" - torch.manual_seed(self.seed) - - self.group_matrix = create_group_matrix(self.grouped_features, self.input_dim) - - self.network = tab_network.TabNet( - self.input_dim, - self.output_dim, - n_d=self.n_d, - n_a=self.n_a, - n_steps=self.n_steps, - gamma=self.gamma, - cat_idxs=self.cat_idxs, - cat_dims=self.cat_dims, - cat_emb_dim=self.cat_emb_dim, - n_independent=self.n_independent, - n_shared=self.n_shared, - epsilon=self.epsilon, - virtual_batch_size=self.virtual_batch_size, - momentum=self.momentum, - mask_type=self.mask_type, - group_attention_matrix=self.group_matrix.to(self.device), - ).to(self.device) - - self.reducing_matrix = create_explain_matrix( - self.network.input_dim, - self.network.cat_emb_dim, - self.network.cat_idxs, - self.network.post_embed_dim, - ) - - def _set_metrics(self, metrics, eval_names): - """Set attributes relative to the metrics. - - Parameters - ---------- - metrics : list of str - List of eval metric names. - eval_names : list of str - List of eval set names. 
- - """ - metrics = metrics or [self._default_metric] - - metrics = check_metrics(metrics) - # Set metric container for each sets - self._metric_container_dict = {} - for name in eval_names: - self._metric_container_dict.update({name: MetricContainer(metrics, prefix=f"{name}_")}) - - self._metrics = [] - self._metrics_names = [] - for _, metric_container in self._metric_container_dict.items(): - self._metrics.extend(metric_container.metrics) - self._metrics_names.extend(metric_container.names) - - # Early stopping metric is the last eval metric - self.early_stopping_metric = self._metrics_names[-1] if len(self._metrics_names) > 0 else None - - def _set_callbacks(self, custom_callbacks): - """Setup the callbacks functions. - - Parameters - ---------- - custom_callbacks : list of func - List of callback functions. - - """ - # Setup default callbacks history, early stopping and scheduler - callbacks = [] - self.history = History(self, verbose=self.verbose) - callbacks.append(self.history) - if (self.early_stopping_metric is not None) and (self.patience > 0): - early_stopping = EarlyStopping( - early_stopping_metric=self.early_stopping_metric, - is_maximize=(self._metrics[-1]._maximize if len(self._metrics) > 0 else None), - patience=self.patience, - ) - callbacks.append(early_stopping) - else: - wrn_msg = "No early stopping will be performed, last training weights will be used." 
- warnings.warn(wrn_msg) - - if self.scheduler_fn is not None: - # Add LR Scheduler call_back - is_batch_level = self.scheduler_params.pop("is_batch_level", False) - scheduler = LRSchedulerCallback( - scheduler_fn=self.scheduler_fn, - scheduler_params=self.scheduler_params, - optimizer=self._optimizer, - early_stopping_metric=self.early_stopping_metric, - is_batch_level=is_batch_level, - ) - callbacks.append(scheduler) - - if custom_callbacks: - callbacks.extend(custom_callbacks) - self._callback_container = CallbackContainer(callbacks) - self._callback_container.set_trainer(self) - - def _set_optimizer(self): - """Setup optimizer.""" - self._optimizer = self.optimizer_fn(self.network.parameters(), **self.optimizer_params) - - def _construct_loaders(self, X_train, y_train, eval_set): - """Generate dataloaders for train and eval set. - - Parameters - ---------- - X_train : np.array - Train set. - y_train : np.array - Train targets. - eval_set : list of tuple - List of eval tuple set (X, y). - - Returns - ------- - train_dataloader : `torch.utils.data.Dataloader` - Training dataloader. - valid_dataloaders : list of `torch.utils.data.Dataloader` - List of validation dataloaders. - - """ - # all weights are not allowed for this type of model - y_train_mapped = self.prepare_target(y_train) - for i, (X, y) in enumerate(eval_set): - y_mapped = self.prepare_target(y) - eval_set[i] = (X, y_mapped) - - train_dataloader, valid_dataloaders = create_dataloaders( - X_train, - y_train_mapped, - eval_set, - self.updated_weights, - self.batch_size, - self.num_workers, - self.drop_last, - self.pin_memory, - ) - return train_dataloader, valid_dataloaders - - def _compute_feature_importances(self, X): - """Compute global feature importance. - - Parameters - ---------- - loader : `torch.utils.data.Dataloader` - Pytorch dataloader. 
- - """ - M_explain, _ = self.explain(X, normalize=False) - sum_explain = M_explain.sum(axis=0) - feature_importances_ = sum_explain / np.sum(sum_explain) - return feature_importances_ - - def _update_network_params(self): - self.network.virtual_batch_size = self.virtual_batch_size - - @abstractmethod - def update_fit_params(self, X_train, y_train, eval_set, weights): - """ - Set attributes relative to fit function. - - Parameters - ---------- - X_train : np.ndarray - Train set - y_train : np.array - Train targets - eval_set : list of tuple - List of eval tuple set (X, y). - weights : bool or dictionnary - 0 for no balancing - 1 for automated balancing - """ - raise NotImplementedError("users must define update_fit_params to use this base class") - - @abstractmethod - def compute_loss(self, y_score, y_true): - """ - Compute the loss. - - Parameters - ---------- - y_score : a :tensor: `torch.Tensor` - Score matrix - y_true : a :tensor: `torch.Tensor` - Target matrix - - Returns - ------- - float - Loss value - """ - raise NotImplementedError("users must define compute_loss to use this base class") - - @abstractmethod - def prepare_target(self, y): - """ - Prepare target before training. - - Parameters - ---------- - y : a :tensor: `torch.Tensor` - Target matrix. - - Returns - ------- - `torch.Tensor` - Converted target matrix. - """ - raise NotImplementedError("users must define prepare_target to use this base class") diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py deleted file mode 100644 index b520c0b0..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/augmentations.py +++ /dev/null @@ -1,93 +0,0 @@ -import torch -from pytorch_tabnet.utils import define_device -import numpy as np - - -class RegressionSMOTE: - """ - Apply SMOTE - - This will average a percentage p of the elements in the batch with other elements. 
- The target will be averaged as well (this might work with binary classification - and certain loss), following a beta distribution. - """ - - def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): - "" - self.seed = seed - self._set_seed() - self.device = define_device(device_name) - self.alpha = alpha - self.beta = beta - self.p = p - if (p < 0.0) or (p > 1.0): - raise ValueError("Value of p should be between 0. and 1.") - - def _set_seed(self): - torch.manual_seed(self.seed) - np.random.seed(self.seed) - return - - def __call__(self, X, y): - batch_size = X.shape[0] - random_values = torch.rand(batch_size, device=self.device) - idx_to_change = random_values < self.p - - # ensure that first element to switch has probability > 0.5 - np_betas = np.random.beta(self.alpha, self.beta, batch_size) / 2 + 0.5 - random_betas = torch.from_numpy(np_betas).to(self.device).float() - index_permute = torch.randperm(batch_size, device=self.device) - - X[idx_to_change] = random_betas[idx_to_change, None] * X[idx_to_change] - X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view( - X[idx_to_change].size() - ) # noqa - - y[idx_to_change] = random_betas[idx_to_change, None] * y[idx_to_change] - y[idx_to_change] += (1 - random_betas[idx_to_change, None]) * y[index_permute][idx_to_change].view( - y[idx_to_change].size() - ) # noqa - - return X, y - - -class ClassificationSMOTE: - """ - Apply SMOTE for classification tasks. - - This will average a percentage p of the elements in the batch with other elements. - The target will stay unchanged and keep the value of the most important row in the mix. - """ - - def __init__(self, device_name="auto", p=0.8, alpha=0.5, beta=0.5, seed=0): - "" - self.seed = seed - self._set_seed() - self.device = define_device(device_name) - self.alpha = alpha - self.beta = beta - self.p = p - if (p < 0.0) or (p > 1.0): - raise ValueError("Value of p should be between 0. 
and 1.") - - def _set_seed(self): - torch.manual_seed(self.seed) - np.random.seed(self.seed) - return - - def __call__(self, X, y): - batch_size = X.shape[0] - random_values = torch.rand(batch_size, device=self.device) - idx_to_change = random_values < self.p - - # ensure that first element to switch has probability > 0.5 - np_betas = np.random.beta(self.alpha, self.beta, batch_size) / 2 + 0.5 - random_betas = torch.from_numpy(np_betas).to(self.device).float() - index_permute = torch.randperm(batch_size, device=self.device) - - X[idx_to_change] = random_betas[idx_to_change, None] * X[idx_to_change] - X[idx_to_change] += (1 - random_betas[idx_to_change, None]) * X[index_permute][idx_to_change].view( - X[idx_to_change].size() - ) # noqa - - return X, y diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py deleted file mode 100644 index 5c266502..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/callbacks.py +++ /dev/null @@ -1,287 +0,0 @@ -import time -import datetime -import copy -import numpy as np -from dataclasses import dataclass, field -from typing import List, Any -import warnings - - -class Callback: - """ - Abstract base class used to build new callbacks. - """ - - def __init__(self): - pass - - def set_params(self, params): - self.params = params - - def set_trainer(self, model): - self.trainer = model - - def on_epoch_begin(self, epoch, logs=None): - pass - - def on_epoch_end(self, epoch, logs=None): - pass - - def on_batch_begin(self, batch, logs=None): - pass - - def on_batch_end(self, batch, logs=None): - pass - - def on_train_begin(self, logs=None): - pass - - def on_train_end(self, logs=None): - pass - - -@dataclass -class CallbackContainer: - """ - Container holding a list of callbacks. 
- """ - - callbacks: List[Callback] = field(default_factory=list) - - def append(self, callback): - self.callbacks.append(callback) - - def set_params(self, params): - for callback in self.callbacks: - callback.set_params(params) - - def set_trainer(self, trainer): - self.trainer = trainer - for callback in self.callbacks: - callback.set_trainer(trainer) - - def on_epoch_begin(self, epoch, logs=None): - logs = logs or {} - for callback in self.callbacks: - callback.on_epoch_begin(epoch, logs) - - def on_epoch_end(self, epoch, logs=None): - logs = logs or {} - for callback in self.callbacks: - callback.on_epoch_end(epoch, logs) - - def on_batch_begin(self, batch, logs=None): - logs = logs or {} - for callback in self.callbacks: - callback.on_batch_begin(batch, logs) - - def on_batch_end(self, batch, logs=None): - logs = logs or {} - for callback in self.callbacks: - callback.on_batch_end(batch, logs) - - def on_train_begin(self, logs=None): - logs = logs or {} - logs["start_time"] = time.time() - for callback in self.callbacks: - callback.on_train_begin(logs) - - def on_train_end(self, logs=None): - logs = logs or {} - for callback in self.callbacks: - callback.on_train_end(logs) - - -@dataclass -class EarlyStopping(Callback): - """EarlyStopping callback to exit the training loop if early_stopping_metric - does not improve by a certain amount for a certain - number of epochs. - - Parameters - --------- - early_stopping_metric : str - Early stopping metric name - is_maximize : bool - Whether to maximize or not early_stopping_metric - tol : float - minimum change in monitored value to qualify as improvement. - This number should be positive. - patience : integer - number of epochs to wait for improvement before terminating. 
- the counter be reset after each improvement - - """ - - early_stopping_metric: str - is_maximize: bool - tol: float = 0.0 - patience: int = 5 - - def __post_init__(self): - self.best_epoch = 0 - self.stopped_epoch = 0 - self.wait = 0 - self.best_weights = None - self.best_loss = np.inf - if self.is_maximize: - self.best_loss = -self.best_loss - super().__init__() - - def on_epoch_end(self, epoch, logs=None): - current_loss = logs.get(self.early_stopping_metric) - if current_loss is None: - return - - loss_change = current_loss - self.best_loss - max_improved = self.is_maximize and loss_change > self.tol - min_improved = (not self.is_maximize) and (-loss_change > self.tol) - if max_improved or min_improved: - self.best_loss = current_loss - self.best_epoch = epoch - self.wait = 1 - self.best_weights = copy.deepcopy(self.trainer.network.state_dict()) - else: - if self.wait >= self.patience: - self.stopped_epoch = epoch - self.trainer._stop_training = True - self.wait += 1 - - def on_train_end(self, logs=None): - self.trainer.best_epoch = self.best_epoch - self.trainer.best_cost = self.best_loss - - if self.best_weights is not None: - self.trainer.network.load_state_dict(self.best_weights) - - if self.stopped_epoch > 0: - msg = f"\nEarly stopping occurred at epoch {self.stopped_epoch}" - msg += ( - f" with best_epoch = {self.best_epoch} and " - + f"best_{self.early_stopping_metric} = {round(self.best_loss, 5)}" - ) - print(msg) - else: - msg = ( - f"Stop training because you reached max_epochs = {self.trainer.max_epochs}" - + f" with best_epoch = {self.best_epoch} and " - + f"best_{self.early_stopping_metric} = {round(self.best_loss, 5)}" - ) - print(msg) - wrn_msg = "Best weights from best epoch are automatically used!" - warnings.warn(wrn_msg) - - -@dataclass -class History(Callback): - """Callback that records events into a `History` object. - This callback is automatically applied to - every SuperModule. 
- - Parameters - --------- - trainer : DeepRecoModel - Model class to train - verbose : int - Print results every verbose iteration - - """ - - trainer: Any - verbose: int = 1 - - def __post_init__(self): - super().__init__() - self.samples_seen = 0.0 - self.total_time = 0.0 - - def on_train_begin(self, logs=None): - self.history = {"loss": []} - self.history.update({"lr": []}) - self.history.update({name: [] for name in self.trainer._metrics_names}) - self.start_time = logs["start_time"] - self.epoch_loss = 0.0 - - def on_epoch_begin(self, epoch, logs=None): - self.epoch_metrics = {"loss": 0.0} - self.samples_seen = 0.0 - - def on_epoch_end(self, epoch, logs=None): - self.epoch_metrics["loss"] = self.epoch_loss - for metric_name, metric_value in self.epoch_metrics.items(): - self.history[metric_name].append(metric_value) - if self.verbose == 0: - return - if epoch % self.verbose != 0: - return - msg = f"epoch {epoch:<3}" - for metric_name, metric_value in self.epoch_metrics.items(): - if metric_name != "lr": - msg += f"| {metric_name:<3}: {np.round(metric_value, 5):<8}" - self.total_time = int(time.time() - self.start_time) - msg += f"| {str(datetime.timedelta(seconds=self.total_time)) + 's':<6}" - print(msg) - - def on_batch_end(self, batch, logs=None): - batch_size = logs["batch_size"] - self.epoch_loss = (self.samples_seen * self.epoch_loss + batch_size * logs["loss"]) / ( - self.samples_seen + batch_size - ) - self.samples_seen += batch_size - - def __getitem__(self, name): - return self.history[name] - - def __repr__(self): - return str(self.history) - - def __str__(self): - return str(self.history) - - -@dataclass -class LRSchedulerCallback(Callback): - """Wrapper for most torch scheduler functions. 
- - Parameters - --------- - scheduler_fn : torch.optim.lr_scheduler - Torch scheduling class - scheduler_params : dict - Dictionnary containing all parameters for the scheduler_fn - is_batch_level : bool (default = False) - If set to False : lr updates will happen at every epoch - If set to True : lr updates happen at every batch - Set this to True for OneCycleLR for example - """ - - scheduler_fn: Any - optimizer: Any - scheduler_params: dict - early_stopping_metric: str - is_batch_level: bool = False - - def __post_init__( - self, - ): - self.is_metric_related = hasattr(self.scheduler_fn, "is_better") - self.scheduler = self.scheduler_fn(self.optimizer, **self.scheduler_params) - super().__init__() - - def on_batch_end(self, batch, logs=None): - if self.is_batch_level: - self.scheduler.step() - else: - pass - - def on_epoch_end(self, epoch, logs=None): - current_loss = logs.get(self.early_stopping_metric) - if current_loss is None: - return - if self.is_batch_level: - pass - else: - if self.is_metric_related: - self.scheduler.step(current_loss) - else: - self.scheduler.step() diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py deleted file mode 100644 index ae716f33..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/metrics.py +++ /dev/null @@ -1,515 +0,0 @@ -from dataclasses import dataclass -from typing import List -import numpy as np -from sklearn.metrics import ( - roc_auc_score, - mean_squared_error, - mean_absolute_error, - accuracy_score, - log_loss, - balanced_accuracy_score, - mean_squared_log_error, -) -import torch - - -def UnsupervisedLoss(y_pred, embedded_x, obf_vars, eps=1e-9): - """ - Implements unsupervised loss function. 
- This differs from orginal paper as it's scaled to be batch size independent - and number of features reconstructed independent (by taking the mean) - - Parameters - ---------- - y_pred : torch.Tensor or np.array - Reconstructed prediction (with embeddings) - embedded_x : torch.Tensor - Original input embedded by network - obf_vars : torch.Tensor - Binary mask for obfuscated variables. - 1 means the variable was obfuscated so reconstruction is based on this. - eps : float - A small floating point to avoid ZeroDivisionError - This can happen in degenerated case when a feature has only one value - - Returns - ------- - loss : torch float - Unsupervised loss, average value over batch samples. - """ - errors = y_pred - embedded_x - reconstruction_errors = torch.mul(errors, obf_vars) ** 2 - batch_means = torch.mean(embedded_x, dim=0) - batch_means[batch_means == 0] = 1 - - batch_stds = torch.std(embedded_x, dim=0) ** 2 - batch_stds[batch_stds == 0] = batch_means[batch_stds == 0] - features_loss = torch.matmul(reconstruction_errors, 1 / batch_stds) - # compute the number of obfuscated variables to reconstruct - nb_reconstructed_variables = torch.sum(obf_vars, dim=1) - # take the mean of the reconstructed variable errors - features_loss = features_loss / (nb_reconstructed_variables + eps) - # here we take the mean per batch, contrary to the paper - loss = torch.mean(features_loss) - return loss - - -def UnsupervisedLossNumpy(y_pred, embedded_x, obf_vars, eps=1e-9): - errors = y_pred - embedded_x - reconstruction_errors = np.multiply(errors, obf_vars) ** 2 - batch_means = np.mean(embedded_x, axis=0) - batch_means = np.where(batch_means == 0, 1, batch_means) - - batch_stds = np.std(embedded_x, axis=0, ddof=1) ** 2 - batch_stds = np.where(batch_stds == 0, batch_means, batch_stds) - features_loss = np.matmul(reconstruction_errors, 1 / batch_stds) - # compute the number of obfuscated variables to reconstruct - nb_reconstructed_variables = np.sum(obf_vars, axis=1) - # take the 
mean of the reconstructed variable errors - features_loss = features_loss / (nb_reconstructed_variables + eps) - # here we take the mean per batch, contrary to the paper - loss = np.mean(features_loss) - return loss - - -@dataclass -class UnsupMetricContainer: - """Container holding a list of metrics. - - Parameters - ---------- - y_pred : torch.Tensor or np.array - Reconstructed prediction (with embeddings) - embedded_x : torch.Tensor - Original input embedded by network - obf_vars : torch.Tensor - Binary mask for obfuscated variables. - 1 means the variables was obfuscated so reconstruction is based on this. - - """ - - metric_names: List[str] - prefix: str = "" - - def __post_init__(self): - self.metrics = Metric.get_metrics_by_names(self.metric_names) - self.names = [self.prefix + name for name in self.metric_names] - - def __call__(self, y_pred, embedded_x, obf_vars): - """Compute all metrics and store into a dict. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_pred : np.ndarray - Score matrix or vector - - Returns - ------- - dict - Dict of metrics ({metric_name: metric_value}). - - """ - logs = {} - for metric in self.metrics: - res = metric(y_pred, embedded_x, obf_vars) - logs[self.prefix + metric._name] = res - return logs - - -@dataclass -class MetricContainer: - """Container holding a list of metrics. - - Parameters - ---------- - metric_names : list of str - List of metric names. - prefix : str - Prefix of metric names. - - """ - - metric_names: List[str] - prefix: str = "" - - def __post_init__(self): - self.metrics = Metric.get_metrics_by_names(self.metric_names) - self.names = [self.prefix + name for name in self.metric_names] - - def __call__(self, y_true, y_pred): - """Compute all metrics and store into a dict. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_pred : np.ndarray - Score matrix or vector - - Returns - ------- - dict - Dict of metrics ({metric_name: metric_value}). 
- - """ - logs = {} - for metric in self.metrics: - if isinstance(y_pred, list): - res = np.mean([metric(y_true[:, i], y_pred[i]) for i in range(len(y_pred))]) - else: - res = metric(y_true, y_pred) - logs[self.prefix + metric._name] = res - return logs - - -class Metric: - def __call__(self, y_true, y_pred): - raise NotImplementedError("Custom Metrics must implement this function") - - @classmethod - def get_metrics_by_names(cls, names): - """Get list of metric classes. - - Parameters - ---------- - cls : Metric - Metric class. - names : list - List of metric names. - - Returns - ------- - metrics : list - List of metric classes. - - """ - available_metrics = cls.__subclasses__() - available_names = [metric()._name for metric in available_metrics] - metrics = [] - for name in names: - assert name in available_names, f"{name} is not available, choose in {available_names}" - idx = available_names.index(name) - metric = available_metrics[idx]() - metrics.append(metric) - return metrics - - -class AUC(Metric): - """ - AUC. - """ - - def __init__(self): - self._name = "auc" - self._maximize = True - - def __call__(self, y_true, y_score): - """ - Compute AUC of predictions. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_score : np.ndarray - Score matrix or vector - - Returns - ------- - float - AUC of predictions vs targets. - """ - return roc_auc_score(y_true, y_score[:, 1]) - - -class Accuracy(Metric): - """ - Accuracy. - """ - - def __init__(self): - self._name = "accuracy" - self._maximize = True - - def __call__(self, y_true, y_score): - """ - Compute Accuracy of predictions. - - Parameters - ---------- - y_true: np.ndarray - Target matrix or vector - y_score: np.ndarray - Score matrix or vector - - Returns - ------- - float - Accuracy of predictions vs targets. - """ - y_pred = np.argmax(y_score, axis=1) - return accuracy_score(y_true, y_pred) - - -class BalancedAccuracy(Metric): - """ - Balanced Accuracy. 
- """ - - def __init__(self): - self._name = "balanced_accuracy" - self._maximize = True - - def __call__(self, y_true, y_score): - """ - Compute Accuracy of predictions. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_score : np.ndarray - Score matrix or vector - - Returns - ------- - float - Accuracy of predictions vs targets. - """ - y_pred = np.argmax(y_score, axis=1) - return balanced_accuracy_score(y_true, y_pred) - - -class LogLoss(Metric): - """ - LogLoss. - """ - - def __init__(self): - self._name = "logloss" - self._maximize = False - - def __call__(self, y_true, y_score): - """ - Compute LogLoss of predictions. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_score : np.ndarray - Score matrix or vector - - Returns - ------- - float - LogLoss of predictions vs targets. - """ - return log_loss(y_true, y_score) - - -class MAE(Metric): - """ - Mean Absolute Error. - """ - - def __init__(self): - self._name = "mae" - self._maximize = False - - def __call__(self, y_true, y_score): - """ - Compute MAE (Mean Absolute Error) of predictions. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_score : np.ndarray - Score matrix or vector - - Returns - ------- - float - MAE of predictions vs targets. - """ - return mean_absolute_error(y_true, y_score) - - -class MSE(Metric): - """ - Mean Squared Error. - """ - - def __init__(self): - self._name = "mse" - self._maximize = False - - def __call__(self, y_true, y_score): - """ - Compute MSE (Mean Squared Error) of predictions. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_score : np.ndarray - Score matrix or vector - - Returns - ------- - float - MSE of predictions vs targets. - """ - return mean_squared_error(y_true, y_score) - - -class RMSLE(Metric): - """ - Root Mean squared logarithmic error regression loss. 
- Scikit-implementation: - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html - Note: In order to avoid error, negative predictions are clipped to 0. - This means that you should clip negative predictions manually after calling predict. - """ - - def __init__(self): - self._name = "rmsle" - self._maximize = False - - def __call__(self, y_true, y_score): - """ - Compute RMSLE of predictions. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_score : np.ndarray - Score matrix or vector - - Returns - ------- - float - RMSLE of predictions vs targets. - """ - y_score = np.clip(y_score, a_min=0, a_max=None) - return np.sqrt(mean_squared_log_error(y_true, y_score)) - - -class UnsupervisedMetric(Metric): - """ - Unsupervised metric - """ - - def __init__(self): - self._name = "unsup_loss" - self._maximize = False - - def __call__(self, y_pred, embedded_x, obf_vars): - """ - Compute MSE (Mean Squared Error) of predictions. - - Parameters - ---------- - y_pred : torch.Tensor or np.array - Reconstructed prediction (with embeddings) - embedded_x : torch.Tensor - Original input embedded by network - obf_vars : torch.Tensor - Binary mask for obfuscated variables. - 1 means the variables was obfuscated so reconstruction is based on this. - - Returns - ------- - float - MSE of predictions vs targets. - """ - loss = UnsupervisedLoss(y_pred, embedded_x, obf_vars) - return loss.item() - - -class UnsupervisedNumpyMetric(Metric): - """ - Unsupervised metric - """ - - def __init__(self): - self._name = "unsup_loss_numpy" - self._maximize = False - - def __call__(self, y_pred, embedded_x, obf_vars): - """ - Compute MSE (Mean Squared Error) of predictions. - - Parameters - ---------- - y_pred : torch.Tensor or np.array - Reconstructed prediction (with embeddings) - embedded_x : torch.Tensor - Original input embedded by network - obf_vars : torch.Tensor - Binary mask for obfuscated variables. 
- 1 means the variables was obfuscated so reconstruction is based on this. - - Returns - ------- - float - MSE of predictions vs targets. - """ - return UnsupervisedLossNumpy(y_pred, embedded_x, obf_vars) - - -class RMSE(Metric): - """ - Root Mean Squared Error. - """ - - def __init__(self): - self._name = "rmse" - self._maximize = False - - def __call__(self, y_true, y_score): - """ - Compute RMSE (Root Mean Squared Error) of predictions. - - Parameters - ---------- - y_true : np.ndarray - Target matrix or vector - y_score : np.ndarray - Score matrix or vector - - Returns - ------- - float - RMSE of predictions vs targets. - """ - return np.sqrt(mean_squared_error(y_true, y_score)) - - -def check_metrics(metrics): - """Check if custom metrics are provided. - - Parameters - ---------- - metrics : list of str or classes - List with built-in metrics (str) or custom metrics (classes). - - Returns - ------- - val_metrics : list of str - List of metric names. - - """ - val_metrics = [] - for metric in metrics: - if isinstance(metric, str): - val_metrics.append(metric) - elif issubclass(metric, Metric): - val_metrics.append(metric()._name) - else: - raise TypeError("You need to provide a valid metric format") - return val_metrics diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py deleted file mode 100644 index b6fa2ef3..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multiclass_utils.py +++ /dev/null @@ -1,402 +0,0 @@ -# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi -# -# License: BSD 3 clause -""" -Multi-class / multi-label utility function -========================================== - -""" -from collections.abc import Sequence -from itertools import chain - -from scipy.sparse import issparse -from scipy.sparse.base import spmatrix -from scipy.sparse import dok_matrix -from scipy.sparse import lil_matrix -import scipy.sparse as sp - -import numpy as np 
-import pandas as pd - - -def _assert_all_finite(X, allow_nan=False): - """Like assert_all_finite, but only for ndarray.""" - - X = np.asanyarray(X) - # First try an O(n) time, O(1) space solution for the common case that - # everything is finite; fall back to O(n) space np.isfinite to prevent - # false positives from overflow in sum method. The sum is also calculated - # safely to reduce dtype induced overflows. - is_float = X.dtype.kind in "fc" - if is_float and (np.isfinite(np.sum(X))): - pass - elif is_float: - msg_err = "Input contains {} or a value too large for {!r}." - if allow_nan and np.isinf(X).any() or not allow_nan and not np.isfinite(X).all(): - type_err = "infinity" if allow_nan else "NaN, infinity" - raise ValueError(msg_err.format(type_err, X.dtype)) - # for object dtype data, we only check for NaNs (GH-13254) - elif X.dtype == np.dtype("object") and not allow_nan: - if np.isnan(X).any(): - raise ValueError("Input contains NaN") - - -def assert_all_finite(X, allow_nan=False): - """Throw a ValueError if X contains NaN or infinity. 
- - Parameters - ---------- - X : array or sparse matrix - allow_nan : bool - """ - _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) - - -def _unique_multiclass(y): - if hasattr(y, "__array__"): - return np.unique(np.asarray(y)) - else: - return set(y) - - -def _unique_indicator(y): - """ - Not implemented - """ - raise IndexError( - f"""Given labels are of size {y.shape} while they should be (n_samples,) \n""" - + """If attempting multilabel classification, try using TabNetMultiTaskClassification """ - + """or TabNetRegressor""" - ) - - -_FN_UNIQUE_LABELS = { - "binary": _unique_multiclass, - "multiclass": _unique_multiclass, - "multilabel-indicator": _unique_indicator, -} - - -def unique_labels(*ys): - """Extract an ordered array of unique labels - - We don't allow: - - mix of multilabel and multiclass (single label) targets - - mix of label indicator matrix and anything else, - because there are no explicit labels) - - mix of label indicator matrices of different sizes - - mix of string and integer labels - - At the moment, we also don't allow "multiclass-multioutput" input type. - - Parameters - ---------- - *ys : array-likes - - Returns - ------- - out : numpy array of shape [n_unique_labels] - An ordered array of unique labels. 
- - Examples - -------- - >>> from sklearn.utils.multiclass import unique_labels - >>> unique_labels([3, 5, 5, 5, 7, 7]) - array([3, 5, 7]) - >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]) - array([1, 2, 3, 4]) - >>> unique_labels([1, 2, 10], [5, 11]) - array([ 1, 2, 5, 10, 11]) - """ - if not ys: - raise ValueError("No argument has been passed.") - # Check that we don't mix label format - - ys_types = set(type_of_target(x) for x in ys) - if ys_types == {"binary", "multiclass"}: - ys_types = {"multiclass"} - - if len(ys_types) > 1: - raise ValueError("Mix type of y not allowed, got types %s" % ys_types) - - label_type = ys_types.pop() - - # Get the unique set of labels - _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) - if not _unique_labels: - raise ValueError("Unknown label type: %s" % repr(ys)) - - ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys)) - - # Check that we don't mix string type with number type - if len(set(isinstance(label, str) for label in ys_labels)) > 1: - raise ValueError("Mix of label input types (string and number)") - - return np.array(sorted(ys_labels)) - - -def _is_integral_float(y): - return y.dtype.kind == "f" and np.all(y.astype(int) == y) - - -def is_multilabel(y): - """Check if ``y`` is in a multilabel format. - - Parameters - ---------- - y : numpy array of shape [n_samples] - Target values. - - Returns - ------- - out : bool - Return ``True``, if ``y`` is in a multilabel format, else ```False``. 
- - Examples - -------- - >>> import numpy as np - >>> from sklearn.utils.multiclass import is_multilabel - >>> is_multilabel([0, 1, 0, 1]) - False - >>> is_multilabel([[1], [0, 2], []]) - False - >>> is_multilabel(np.array([[1, 0], [0, 0]])) - True - >>> is_multilabel(np.array([[1], [0], [0]])) - False - >>> is_multilabel(np.array([[1, 0, 0]])) - True - """ - if hasattr(y, "__array__"): - y = np.asarray(y) - if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1): - return False - - if issparse(y): - if isinstance(y, (dok_matrix, lil_matrix)): - y = y.tocsr() - return ( - len(y.data) == 0 - or np.unique(y.data).size == 1 - and (y.dtype.kind in "biu" or _is_integral_float(np.unique(y.data))) # bool, int, uint - ) - else: - labels = np.unique(y) - - return len(labels) < 3 and (y.dtype.kind in "biu" or _is_integral_float(labels)) # bool, int, uint - - -def check_classification_targets(y): - """Ensure that target y is of a non-regression type. - - Only the following target types (as defined in type_of_target) are allowed: - 'binary', 'multiclass', 'multiclass-multioutput', - 'multilabel-indicator', 'multilabel-sequences' - - Parameters - ---------- - y : array-like - """ - y_type = type_of_target(y) - if y_type not in [ - "binary", - "multiclass", - "multiclass-multioutput", - "multilabel-indicator", - "multilabel-sequences", - ]: - raise ValueError("Unknown label type: %r" % y_type) - - -def type_of_target(y): - """Determine the type of data indicated by the target. - - Note that this type is the most specific type that can be inferred. - For example: - - * ``binary`` is more specific but compatible with ``multiclass``. - * ``multiclass`` of integers is more specific but compatible with - ``continuous``. - * ``multilabel-indicator`` is more specific but compatible with - ``multiclass-multioutput``. 
- - Parameters - ---------- - y : array-like - - Returns - ------- - target_type : string - One of: - - * 'continuous': `y` is an array-like of floats that are not all - integers, and is 1d or a column vector. - * 'continuous-multioutput': `y` is a 2d array of floats that are - not all integers, and both dimensions are of size > 1. - * 'binary': `y` contains <= 2 discrete values and is 1d or a column - vector. - * 'multiclass': `y` contains more than two discrete values, is not a - sequence of sequences, and is 1d or a column vector. - * 'multiclass-multioutput': `y` is a 2d array that contains more - than two discrete values, is not a sequence of sequences, and both - dimensions are of size > 1. - * 'multilabel-indicator': `y` is a label indicator matrix, an array - of two dimensions with at least two columns, and at most 2 unique - values. - * 'unknown': `y` is array-like but none of the above, such as a 3d - array, sequence of sequences, or an array of non-sequence objects. - - Examples - -------- - >>> import numpy as np - >>> type_of_target([0.1, 0.6]) - 'continuous' - >>> type_of_target([1, -1, -1, 1]) - 'binary' - >>> type_of_target(['a', 'b', 'a']) - 'binary' - >>> type_of_target([1.0, 2.0]) - 'binary' - >>> type_of_target([1, 0, 2]) - 'multiclass' - >>> type_of_target([1.0, 0.0, 3.0]) - 'multiclass' - >>> type_of_target(['a', 'b', 'c']) - 'multiclass' - >>> type_of_target(np.array([[1, 2], [3, 1]])) - 'multiclass-multioutput' - >>> type_of_target([[1, 2]]) - 'multiclass-multioutput' - >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]])) - 'continuous-multioutput' - >>> type_of_target(np.array([[0, 1], [1, 1]])) - 'multilabel-indicator' - """ - valid = (isinstance(y, (Sequence, spmatrix)) or hasattr(y, "__array__")) and not isinstance(y, str) - - if not valid: - raise ValueError("Expected array-like (array or non-string sequence), " "got %r" % y) - - sparseseries = y.__class__.__name__ == "SparseSeries" - if sparseseries: - raise ValueError("y cannot be 
class 'SparseSeries'.") - - if is_multilabel(y): - return "multilabel-indicator" - - try: - y = np.asarray(y) - except ValueError: - # Known to fail in numpy 1.3 for array of arrays - return "unknown" - - # The old sequence of sequences format - try: - if not hasattr(y[0], "__array__") and isinstance(y[0], Sequence) and not isinstance(y[0], str): - raise ValueError( - "You appear to be using a legacy multi-label data" - " representation. Sequence of sequences are no" - " longer supported; use a binary array or sparse" - " matrix instead - the MultiLabelBinarizer" - " transformer can convert to this format." - ) - except IndexError: - pass - - # Invalid inputs - if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], str)): - return "unknown" # [[[1, 2]]] or [obj_1] and not ["label_1"] - - if y.ndim == 2 and y.shape[1] == 0: - return "unknown" # [[]] - - if y.ndim == 2 and y.shape[1] > 1: - suffix = "-multioutput" # [[1, 2], [1, 2]] - else: - suffix = "" # [1, 2, 3] or [[1], [2], [3]] - - # check float and contains non-integer float values - if y.dtype.kind == "f" and np.any(y != y.astype(int)): - # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] - _assert_all_finite(y) - return "continuous" + suffix - - if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): - return "multiclass" + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] - else: - return "binary" # [1, 2] or [["a"], ["b"]] - - -def check_unique_type(y): - target_types = pd.Series(y).map(type).unique() - if len(target_types) != 1: - raise TypeError(f"Values on the target must have the same type. 
Target has types {target_types}") - - -def infer_output_dim(y_train): - """ - Infer output_dim from targets - - Parameters - ---------- - y_train : np.array - Training targets - - Returns - ------- - output_dim : int - Number of classes for output - train_labels : list - Sorted list of initial classes - """ - check_unique_type(y_train) - train_labels = unique_labels(y_train) - output_dim = len(train_labels) - - return output_dim, train_labels - - -def check_output_dim(labels, y): - if y is not None: - check_unique_type(y) - valid_labels = unique_labels(y) - if not set(valid_labels).issubset(set(labels)): - raise ValueError( - f"""Valid set -- {set(valid_labels)} -- - contains unkown targets from training -- - {set(labels)}""" - ) - return - - -def infer_multitask_output(y_train): - """ - Infer output_dim from targets - This is for multiple tasks. - - Parameters - ---------- - y_train : np.ndarray - Training targets - - Returns - ------- - tasks_dims : list - Number of classes for output - tasks_labels : list - List of sorted list of initial classes - """ - - if len(y_train.shape) < 2: - raise ValueError("y_train should be of shape (n_examples, n_tasks)" + f"but got {y_train.shape}") - nb_tasks = y_train.shape[1] - tasks_dims = [] - tasks_labels = [] - for task_idx in range(nb_tasks): - try: - output_dim, train_labels = infer_output_dim(y_train[:, task_idx]) - tasks_dims.append(output_dim) - tasks_labels.append(train_labels) - except ValueError as err: - raise ValueError(f"""Error for task {task_idx} : {err}""") - return tasks_dims, tasks_labels diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py deleted file mode 100644 index 309c0e39..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/multitask.py +++ /dev/null @@ -1,167 +0,0 @@ -import torch -import numpy as np -from scipy.special import softmax -from pytorch_tabnet.utils import SparsePredictDataset, PredictDataset, 
filter_weights -from pytorch_tabnet.abstract_model import TabModel -from pytorch_tabnet.multiclass_utils import infer_multitask_output, check_output_dim -from torch.utils.data import DataLoader -import scipy - - -class TabNetMultiTaskClassifier(TabModel): - def __post_init__(self): - super(TabNetMultiTaskClassifier, self).__post_init__() - self._task = "classification" - self._default_loss = torch.nn.functional.cross_entropy - self._default_metric = "logloss" - - def prepare_target(self, y): - y_mapped = y.copy() - for task_idx in range(y.shape[1]): - task_mapper = self.target_mapper[task_idx] - y_mapped[:, task_idx] = np.vectorize(task_mapper.get)(y[:, task_idx]) - return y_mapped - - def compute_loss(self, y_pred, y_true): - """ - Computes the loss according to network output and targets - - Parameters - ---------- - y_pred : list of tensors - Output of network - y_true : LongTensor - Targets label encoded - - Returns - ------- - loss : torch.Tensor - output of loss function(s) - - """ - loss = 0 - y_true = y_true.long() - if isinstance(self.loss_fn, list): - # if you specify a different loss for each task - for task_loss, task_output, task_id in zip(self.loss_fn, y_pred, range(len(self.loss_fn))): - loss += task_loss(task_output, y_true[:, task_id]) - else: - # same loss function is applied to all tasks - for task_id, task_output in enumerate(y_pred): - loss += self.loss_fn(task_output, y_true[:, task_id]) - - loss /= len(y_pred) - return loss - - def stack_batches(self, list_y_true, list_y_score): - y_true = np.vstack(list_y_true) - y_score = [] - for i in range(len(self.output_dim)): - score = np.vstack([x[i] for x in list_y_score]) - score = softmax(score, axis=1) - y_score.append(score) - return y_true, y_score - - def update_fit_params(self, X_train, y_train, eval_set, weights): - output_dim, train_labels = infer_multitask_output(y_train) - for _, y in eval_set: - for task_idx in range(y.shape[1]): - check_output_dim(train_labels[task_idx], y[:, task_idx]) 
- self.output_dim = output_dim - self.classes_ = train_labels - self.target_mapper = [ - {class_label: index for index, class_label in enumerate(classes)} for classes in self.classes_ - ] - self.preds_mapper = [ - {str(index): str(class_label) for index, class_label in enumerate(classes)} for classes in self.classes_ - ] - self.updated_weights = weights - filter_weights(self.updated_weights) - - def predict(self, X): - """ - Make predictions on a batch (valid) - - Parameters - ---------- - X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` - Input data - - Returns - ------- - results : np.array - Predictions of the most probable class - """ - self.network.eval() - - if scipy.sparse.issparse(X): - dataloader = DataLoader( - SparsePredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - else: - dataloader = DataLoader( - PredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - - results = {} - for data in dataloader: - data = data.to(self.device).float() - output, _ = self.network(data) - predictions = [ - torch.argmax(torch.nn.Softmax(dim=1)(task_output), dim=1).cpu().detach().numpy().reshape(-1) - for task_output in output - ] - - for task_idx in range(len(self.output_dim)): - results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] - # stack all task individually - results = [np.hstack(task_res) for task_res in results.values()] - # map all task individually - results = [ - np.vectorize(self.preds_mapper[task_idx].get)(task_res.astype(str)) - for task_idx, task_res in enumerate(results) - ] - return results - - def predict_proba(self, X): - """ - Make predictions for classification on a batch (valid) - - Parameters - ---------- - X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` - Input data - - Returns - ------- - res : list of np.ndarray - - """ - self.network.eval() - - if scipy.sparse.issparse(X): - dataloader = DataLoader( - SparsePredictDataset(X), - batch_size=self.batch_size, - 
shuffle=False, - ) - else: - dataloader = DataLoader( - PredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - - results = {} - for data in dataloader: - data = data.to(self.device).float() - output, _ = self.network(data) - predictions = [torch.nn.Softmax(dim=1)(task_output).cpu().detach().numpy() for task_output in output] - for task_idx in range(len(self.output_dim)): - results[task_idx] = results.get(task_idx, []) + [predictions[task_idx]] - res = [np.vstack(task_res) for task_res in results.values()] - return res diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py deleted file mode 100644 index 9044d497..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining.py +++ /dev/null @@ -1,418 +0,0 @@ -import torch -import numpy as np -from torch.utils.data import DataLoader -from pytorch_tabnet import tab_network -from pytorch_tabnet.utils import ( - create_explain_matrix, - filter_weights, - SparsePredictDataset, - PredictDataset, - check_input, - create_group_matrix, -) -from torch.nn.utils import clip_grad_norm_ -from pytorch_tabnet.pretraining_utils import ( - create_dataloaders, - validate_eval_set, -) -from pytorch_tabnet.metrics import ( - UnsupMetricContainer, - check_metrics, - UnsupervisedLoss, -) -from pytorch_tabnet.abstract_model import TabModel -import scipy - - -class TabNetPretrainer(TabModel): - def __post_init__(self): - super(TabNetPretrainer, self).__post_init__() - self._task = "unsupervised" - self._default_loss = UnsupervisedLoss - self._default_metric = "unsup_loss_numpy" - - def prepare_target(self, y): - return y - - def compute_loss(self, output, embedded_x, obf_vars): - return self.loss_fn(output, embedded_x, obf_vars) - - def update_fit_params( - self, - weights, - ): - self.updated_weights = weights - filter_weights(self.updated_weights) - self.preds_mapper = None - - def fit( - self, - X_train, - eval_set=None, - 
eval_name=None, - loss_fn=None, - pretraining_ratio=0.5, - weights=0, - max_epochs=100, - patience=10, - batch_size=1024, - virtual_batch_size=128, - num_workers=0, - drop_last=True, - callbacks=None, - pin_memory=True, - warm_start=False, - ): - """Train a neural network stored in self.network - Using train_dataloader for training data and - valid_dataloader for validation. - - Parameters - ---------- - X_train : np.ndarray - Train set to reconstruct in self supervision - eval_set : list of np.array - List of evaluation set - The last one is used for early stopping - eval_name : list of str - List of eval set names. - eval_metric : list of str - List of evaluation metrics. - The last metric is used for early stopping. - loss_fn : callable or None - a PyTorch loss function - should be left to None for self supervised and non experts - pretraining_ratio : float - Between 0 and 1, percentage of feature to mask for reconstruction - weights : np.array - Sampling weights for each example. - max_epochs : int - Maximum number of epochs during training - patience : int - Number of consecutive non improving epoch before early stopping - batch_size : int - Training batch size - virtual_batch_size : int - Batch size for Ghost Batch Normalization (virtual_batch_size < batch_size) - num_workers : int - Number of workers used in torch.utils.data.DataLoader - drop_last : bool - Whether to drop last batch during training - callbacks : list of callback function - List of custom callbacks - pin_memory: bool - Whether to set pin_memory to True or False during training - """ - # update model name - - self.max_epochs = max_epochs - self.patience = patience - self.batch_size = batch_size - self.virtual_batch_size = virtual_batch_size - self.num_workers = num_workers - self.drop_last = drop_last - self.input_dim = X_train.shape[1] - self._stop_training = False - self.pin_memory = pin_memory and (self.device.type != "cpu") - self.pretraining_ratio = pretraining_ratio - eval_set = eval_set 
if eval_set else [] - - if loss_fn is None: - self.loss_fn = self._default_loss - else: - self.loss_fn = loss_fn - - check_input(X_train) - - self.update_fit_params( - weights, - ) - - # Validate and reformat eval set depending on training data - eval_names = validate_eval_set(eval_set, eval_name, X_train) - train_dataloader, valid_dataloaders = self._construct_loaders(X_train, eval_set) - - if not hasattr(self, "network") or not warm_start: - # model has never been fitted before of warm_start is False - self._set_network() - - self._update_network_params() - self._set_metrics(eval_names) - self._set_optimizer() - self._set_callbacks(callbacks) - - # Call method on_train_begin for all callbacks - self._callback_container.on_train_begin() - - # Training loop over epochs - for epoch_idx in range(self.max_epochs): - - # Call method on_epoch_begin for all callbacks - self._callback_container.on_epoch_begin(epoch_idx) - - self._train_epoch(train_dataloader) - - # Apply predict epoch to all eval sets - for eval_name, valid_dataloader in zip(eval_names, valid_dataloaders): - self._predict_epoch(eval_name, valid_dataloader) - - # Call method on_epoch_end for all callbacks - self._callback_container.on_epoch_end(epoch_idx, logs=self.history.epoch_metrics) - - if self._stop_training: - break - - # Call method on_train_end for all callbacks - self._callback_container.on_train_end() - self.network.eval() - - def _set_network(self): - """Setup the network and explain matrix.""" - if not hasattr(self, "pretraining_ratio"): - self.pretraining_ratio = 0.5 - torch.manual_seed(self.seed) - - self.group_matrix = create_group_matrix(self.grouped_features, self.input_dim) - - self.network = tab_network.TabNetPretraining( - self.input_dim, - pretraining_ratio=self.pretraining_ratio, - n_d=self.n_d, - n_a=self.n_a, - n_steps=self.n_steps, - gamma=self.gamma, - cat_idxs=self.cat_idxs, - cat_dims=self.cat_dims, - cat_emb_dim=self.cat_emb_dim, - n_independent=self.n_independent, - 
n_shared=self.n_shared, - n_shared_decoder=self.n_shared_decoder, - n_indep_decoder=self.n_indep_decoder, - epsilon=self.epsilon, - virtual_batch_size=self.virtual_batch_size, - momentum=self.momentum, - mask_type=self.mask_type, - group_attention_matrix=self.group_matrix.to(self.device), - ).to(self.device) - - self.reducing_matrix = create_explain_matrix( - self.network.input_dim, - self.network.cat_emb_dim, - self.network.cat_idxs, - self.network.post_embed_dim, - ) - - def _update_network_params(self): - self.network.virtual_batch_size = self.virtual_batch_size - self.network.pretraining_ratio = self.pretraining_ratio - - def _set_metrics(self, eval_names): - """Set attributes relative to the metrics. - - Parameters - ---------- - metrics : list of str - List of eval metric names. - eval_names : list of str - List of eval set names. - - """ - metrics = [self._default_metric] - - metrics = check_metrics(metrics) - # Set metric container for each sets - self._metric_container_dict = {} - for name in eval_names: - self._metric_container_dict.update({name: UnsupMetricContainer(metrics, prefix=f"{name}_")}) - - self._metrics = [] - self._metrics_names = [] - for _, metric_container in self._metric_container_dict.items(): - self._metrics.extend(metric_container.metrics) - self._metrics_names.extend(metric_container.names) - - # Early stopping metric is the last eval metric - self.early_stopping_metric = self._metrics_names[-1] if len(self._metrics_names) > 0 else None - - def _construct_loaders(self, X_train, eval_set): - """Generate dataloaders for unsupervised train and eval set. - - Parameters - ---------- - X_train : np.array - Train set. - eval_set : list of tuple - List of eval tuple set (X, y). - - Returns - ------- - train_dataloader : `torch.utils.data.Dataloader` - Training dataloader. - valid_dataloaders : list of `torch.utils.data.Dataloader` - List of validation dataloaders. 
- - """ - train_dataloader, valid_dataloaders = create_dataloaders( - X_train, - eval_set, - self.updated_weights, - self.batch_size, - self.num_workers, - self.drop_last, - self.pin_memory, - ) - return train_dataloader, valid_dataloaders - - def _train_epoch(self, train_loader): - """ - Trains one epoch of the network in self.network - - Parameters - ---------- - train_loader : a :class: `torch.utils.data.Dataloader` - DataLoader with train set - """ - self.network.train() - - for batch_idx, X in enumerate(train_loader): - self._callback_container.on_batch_begin(batch_idx) - - batch_logs = self._train_batch(X) - - self._callback_container.on_batch_end(batch_idx, batch_logs) - - epoch_logs = {"lr": self._optimizer.param_groups[-1]["lr"]} - self.history.epoch_metrics.update(epoch_logs) - - return - - def _train_batch(self, X): - """ - Trains one batch of data - - Parameters - ---------- - X : torch.Tensor - Train matrix - - Returns - ------- - batch_outs : dict - Dictionnary with "y": target and "score": prediction scores. - batch_logs : dict - Dictionnary with "batch_size" and "loss". - """ - batch_logs = {"batch_size": X.shape[0]} - - X = X.to(self.device).float() - - for param in self.network.parameters(): - param.grad = None - - output, embedded_x, obf_vars = self.network(X) - loss = self.compute_loss(output, embedded_x, obf_vars) - - # Perform backward pass and optimization - loss.backward() - if self.clip_value: - clip_grad_norm_(self.network.parameters(), self.clip_value) - self._optimizer.step() - - batch_logs["loss"] = loss.cpu().detach().numpy().item() - - return batch_logs - - def _predict_epoch(self, name, loader): - """ - Predict an epoch and update metrics. 
- - Parameters - ---------- - name : str - Name of the validation set - loader : torch.utils.data.Dataloader - DataLoader with validation set - """ - # Setting network on evaluation mode - self.network.eval() - - list_output = [] - list_embedded_x = [] - list_obfuscation = [] - # Main loop - for batch_idx, X in enumerate(loader): - output, embedded_x, obf_vars = self._predict_batch(X) - list_output.append(output.cpu().detach().numpy()) - list_embedded_x.append(embedded_x.cpu().detach().numpy()) - list_obfuscation.append(obf_vars.cpu().detach().numpy()) - - output, embedded_x, obf_vars = self.stack_batches(list_output, list_embedded_x, list_obfuscation) - - metrics_logs = self._metric_container_dict[name](output, embedded_x, obf_vars) - self.network.train() - self.history.epoch_metrics.update(metrics_logs) - return - - def _predict_batch(self, X): - """ - Predict one batch of data. - - Parameters - ---------- - X : torch.Tensor - Owned products - - Returns - ------- - np.array - model scores - """ - X = X.to(self.device).float() - return self.network(X) - - def stack_batches(self, list_output, list_embedded_x, list_obfuscation): - output = np.vstack(list_output) - embedded_x = np.vstack(list_embedded_x) - obf_vars = np.vstack(list_obfuscation) - return output, embedded_x, obf_vars - - def predict(self, X): - """ - Make predictions on a batch (valid) - - Parameters - ---------- - X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` - Input data - - Returns - ------- - predictions : np.array - Predictions of the regression problem - """ - self.network.eval() - - if scipy.sparse.issparse(X): - dataloader = DataLoader( - SparsePredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - else: - dataloader = DataLoader( - PredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - - results = [] - embedded_res = [] - for batch_nb, data in enumerate(dataloader): - data = data.to(self.device).float() - output, embeded_x, _ = 
self.network(data) - predictions = output.cpu().detach().numpy() - results.append(predictions) - embedded_res.append(embeded_x.cpu().detach().numpy()) - res_output = np.vstack(results) - embedded_inputs = np.vstack(embedded_res) - return res_output, embedded_inputs diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py deleted file mode 100644 index d35e34f2..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/pretraining_utils.py +++ /dev/null @@ -1,119 +0,0 @@ -from torch.utils.data import DataLoader -from pytorch_tabnet.utils import create_sampler, SparsePredictDataset, PredictDataset, check_input -import scipy - - -def create_dataloaders(X_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory): - """ - Create dataloaders with or without subsampling depending on weights and balanced. - - Parameters - ---------- - X_train : np.ndarray or scipy.sparse.csr_matrix - Training data - eval_set : list of np.array (for Xs and ys) or scipy.sparse.csr_matrix (for Xs) - List of eval sets - weights : either 0, 1, dict or iterable - if 0 (default) : no weights will be applied - if 1 : classification only, will balanced class with inverse frequency - if dict : keys are corresponding class values are sample weights - if iterable : list or np array must be of length equal to nb elements - in the training set - batch_size : int - how many samples per batch to load - num_workers : int - how many subprocesses to use for data loading. 0 means that the data - will be loaded in the main process - drop_last : bool - set to True to drop the last incomplete batch, if the dataset size is not - divisible by the batch size. 
If False and the size of dataset is not - divisible by the batch size, then the last batch will be smaller - pin_memory : bool - Whether to pin GPU memory during training - - Returns - ------- - train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader - Training and validation dataloaders - """ - need_shuffle, sampler = create_sampler(weights, X_train) - - if scipy.sparse.issparse(X_train): - train_dataloader = DataLoader( - SparsePredictDataset(X_train), - batch_size=batch_size, - sampler=sampler, - shuffle=need_shuffle, - num_workers=num_workers, - drop_last=drop_last, - pin_memory=pin_memory, - ) - else: - train_dataloader = DataLoader( - PredictDataset(X_train), - batch_size=batch_size, - sampler=sampler, - shuffle=need_shuffle, - num_workers=num_workers, - drop_last=drop_last, - pin_memory=pin_memory, - ) - - valid_dataloaders = [] - for X in eval_set: - if scipy.sparse.issparse(X): - valid_dataloaders.append( - DataLoader( - SparsePredictDataset(X), - batch_size=batch_size, - sampler=sampler, - shuffle=need_shuffle, - num_workers=num_workers, - drop_last=drop_last, - pin_memory=pin_memory, - ) - ) - else: - valid_dataloaders.append( - DataLoader( - PredictDataset(X), - batch_size=batch_size, - sampler=sampler, - shuffle=need_shuffle, - num_workers=num_workers, - drop_last=drop_last, - pin_memory=pin_memory, - ) - ) - - return train_dataloader, valid_dataloaders - - -def validate_eval_set(eval_set, eval_name, X_train): - """Check if the shapes of eval_set are compatible with X_train. - - Parameters - ---------- - eval_set : List of numpy array - The list evaluation set. - The last one is used for early stopping - X_train : np.ndarray - Train owned products - - Returns - ------- - eval_names : list of str - Validated list of eval_names. 
- - """ - eval_names = eval_name or [f"val_{i}" for i in range(len(eval_set))] - assert len(eval_set) == len(eval_names), "eval_set and eval_name have not the same length" - - for set_nb, X in enumerate(eval_set): - check_input(X) - msg = ( - f"Number of columns is different between eval set {set_nb}" - + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" - ) - assert X.shape[1] == X_train.shape[1], msg - return eval_names diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py deleted file mode 100644 index 53a71792..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/sparsemax.py +++ /dev/null @@ -1,276 +0,0 @@ -from torch import nn -from torch.autograd import Function -import torch.nn.functional as F - -import torch - -""" -Other possible implementations: -https://github.com/KrisKorrel/sparsemax-pytorch/blob/master/sparsemax.py -https://github.com/msobroza/SparsemaxPytorch/blob/master/mnist/sparsemax.py -https://github.com/vene/sparse-structured-attention/blob/master/pytorch/torchsparseattn/sparsemax.py -""" - - -# credits to Yandex https://github.com/Qwicen/node/blob/master/lib/nn_utils.py -def _make_ix_like(input, dim=0): - d = input.size(dim) - rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) - view = [1] * input.dim() - view[0] = -1 - return rho.view(view).transpose(0, dim) - - -class SparsemaxFunction(Function): - """ - An implementation of sparsemax (Martins & Astudillo, 2016). See - :cite:`DBLP:journals/corr/MartinsA16` for detailed description. 
- By Ben Peters and Vlad Niculae - """ - - @staticmethod - def forward(ctx, input, dim=-1): - """sparsemax: normalizing sparse transform (a la softmax) - - Parameters - ---------- - ctx : torch.autograd.function._ContextMethodMixin - input : torch.Tensor - any shape - dim : int - dimension along which to apply sparsemax - - Returns - ------- - output : torch.Tensor - same shape as input - - """ - ctx.dim = dim - max_val, _ = input.max(dim=dim, keepdim=True) - input -= max_val # same numerical stability trick as for softmax - tau, supp_size = SparsemaxFunction._threshold_and_support(input, dim=dim) - output = torch.clamp(input - tau, min=0) - ctx.save_for_backward(supp_size, output) - return output - - @staticmethod - def backward(ctx, grad_output): - supp_size, output = ctx.saved_tensors - dim = ctx.dim - grad_input = grad_output.clone() - grad_input[output == 0] = 0 - - v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() - v_hat = v_hat.unsqueeze(dim) - grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) - return grad_input, None - - @staticmethod - def _threshold_and_support(input, dim=-1): - """Sparsemax building block: compute the threshold - - Parameters - ---------- - input: torch.Tensor - any dimension - dim : int - dimension along which to apply the sparsemax - - Returns - ------- - tau : torch.Tensor - the threshold value - support_size : torch.Tensor - - """ - - input_srt, _ = torch.sort(input, descending=True, dim=dim) - input_cumsum = input_srt.cumsum(dim) - 1 - rhos = _make_ix_like(input, dim) - support = rhos * input_srt > input_cumsum - - support_size = support.sum(dim=dim).unsqueeze(dim) - tau = input_cumsum.gather(dim, support_size - 1) - tau /= support_size.to(input.dtype) - return tau, support_size - - -sparsemax = SparsemaxFunction.apply - - -class Sparsemax(nn.Module): - def __init__(self, dim=-1): - self.dim = dim - super(Sparsemax, self).__init__() - - def forward(self, input): - return sparsemax(input, 
self.dim) - - -class Entmax15Function(Function): - """ - An implementation of exact Entmax with alpha=1.5 (B. Peters, V. Niculae, A. Martins). See - :cite:`https://arxiv.org/abs/1905.05702 for detailed description. - Source: https://github.com/deep-spin/entmax - """ - - @staticmethod - def forward(ctx, input, dim=-1): - ctx.dim = dim - - max_val, _ = input.max(dim=dim, keepdim=True) - input = input - max_val # same numerical stability trick as for softmax - input = input / 2 # divide by 2 to solve actual Entmax - - tau_star, _ = Entmax15Function._threshold_and_support(input, dim) - output = torch.clamp(input - tau_star, min=0) ** 2 - ctx.save_for_backward(output) - return output - - @staticmethod - def backward(ctx, grad_output): - (Y,) = ctx.saved_tensors - gppr = Y.sqrt() # = 1 / g'' (Y) - dX = grad_output * gppr - q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) - q = q.unsqueeze(ctx.dim) - dX -= q * gppr - return dX, None - - @staticmethod - def _threshold_and_support(input, dim=-1): - Xsrt, _ = torch.sort(input, descending=True, dim=dim) - - rho = _make_ix_like(input, dim) - mean = Xsrt.cumsum(dim) / rho - mean_sq = (Xsrt ** 2).cumsum(dim) / rho - ss = rho * (mean_sq - mean ** 2) - delta = (1 - ss) / rho - - # NOTE this is not exactly the same as in reference algo - # Fortunately it seems the clamped values never wrongly - # get selected by tau <= sorted_z. Prove this! 
- delta_nz = torch.clamp(delta, 0) - tau = mean - torch.sqrt(delta_nz) - - support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) - tau_star = tau.gather(dim, support_size - 1) - return tau_star, support_size - - -class Entmoid15(Function): - """ A highly optimized equivalent of lambda x: Entmax15([x, 0]) """ - - @staticmethod - def forward(ctx, input): - output = Entmoid15._forward(input) - ctx.save_for_backward(output) - return output - - @staticmethod - def _forward(input): - input, is_pos = abs(input), input >= 0 - tau = (input + torch.sqrt(F.relu(8 - input ** 2))) / 2 - tau.masked_fill_(tau <= input, 2.0) - y_neg = 0.25 * F.relu(tau - input, inplace=True) ** 2 - return torch.where(is_pos, 1 - y_neg, y_neg) - - @staticmethod - def backward(ctx, grad_output): - return Entmoid15._backward(ctx.saved_tensors[0], grad_output) - - @staticmethod - def _backward(output, grad_output): - gppr0, gppr1 = output.sqrt(), (1 - output).sqrt() - grad_input = grad_output * gppr0 - q = grad_input / (gppr0 + gppr1) - grad_input -= q * gppr0 - return grad_input - - -entmax15 = Entmax15Function.apply -entmoid15 = Entmoid15.apply - - -class Entmax15(nn.Module): - def __init__(self, dim=-1): - self.dim = dim - super(Entmax15, self).__init__() - - def forward(self, input): - return entmax15(input, self.dim) - - -# Credits were lost... 
-# def _make_ix_like(input, dim=0): -# d = input.size(dim) -# rho = torch.arange(1, d + 1, device=input.device, dtype=input.dtype) -# view = [1] * input.dim() -# view[0] = -1 -# return rho.view(view).transpose(0, dim) -# -# -# def _threshold_and_support(input, dim=0): -# """Sparsemax building block: compute the threshold -# Args: -# input: any dimension -# dim: dimension along which to apply the sparsemax -# Returns: -# the threshold value -# """ -# -# input_srt, _ = torch.sort(input, descending=True, dim=dim) -# input_cumsum = input_srt.cumsum(dim) - 1 -# rhos = _make_ix_like(input, dim) -# support = rhos * input_srt > input_cumsum -# -# support_size = support.sum(dim=dim).unsqueeze(dim) -# tau = input_cumsum.gather(dim, support_size - 1) -# tau /= support_size.to(input.dtype) -# return tau, support_size -# -# -# class SparsemaxFunction(Function): -# -# @staticmethod -# def forward(ctx, input, dim=0): -# """sparsemax: normalizing sparse transform (a la softmax) -# Parameters: -# input (Tensor): any shape -# dim: dimension along which to apply sparsemax -# Returns: -# output (Tensor): same shape as input -# """ -# ctx.dim = dim -# max_val, _ = input.max(dim=dim, keepdim=True) -# input -= max_val # same numerical stability trick as for softmax -# tau, supp_size = _threshold_and_support(input, dim=dim) -# output = torch.clamp(input - tau, min=0) -# ctx.save_for_backward(supp_size, output) -# return output -# -# @staticmethod -# def backward(ctx, grad_output): -# supp_size, output = ctx.saved_tensors -# dim = ctx.dim -# grad_input = grad_output.clone() -# grad_input[output == 0] = 0 -# -# v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze() -# v_hat = v_hat.unsqueeze(dim) -# grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) -# return grad_input, None -# -# -# sparsemax = SparsemaxFunction.apply -# -# -# class Sparsemax(nn.Module): -# -# def __init__(self, dim=0): -# self.dim = dim -# super(Sparsemax, self).__init__() -# -# def 
forward(self, input): -# return sparsemax(input, self.dim) diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py deleted file mode 100755 index 32115c8c..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_model.py +++ /dev/null @@ -1,146 +0,0 @@ -import torch -import numpy as np -from scipy.special import softmax -from pytorch_tabnet.utils import SparsePredictDataset, PredictDataset, filter_weights -from pytorch_tabnet.abstract_model import TabModel -from pytorch_tabnet.multiclass_utils import infer_output_dim, check_output_dim -from torch.utils.data import DataLoader -import scipy - - -class TabNetClassifier(TabModel): - def __post_init__(self): - super(TabNetClassifier, self).__post_init__() - self._task = "classification" - self._default_loss = torch.nn.functional.cross_entropy - self._default_metric = "accuracy" - - def weight_updater(self, weights): - """ - Updates weights dictionary according to target_mapper. - - Parameters - ---------- - weights : bool or dict - Given weights for balancing training. - - Returns - ------- - bool or dict - Same bool if weights are bool, updated dict otherwise. 
- - """ - if isinstance(weights, int): - return weights - elif isinstance(weights, dict): - return {self.target_mapper[key]: value for key, value in weights.items()} - else: - return weights - - def prepare_target(self, y): - return np.vectorize(self.target_mapper.get)(y) - - def compute_loss(self, y_pred, y_true): - return self.loss_fn(y_pred, y_true.long()) - - def update_fit_params( - self, - X_train, - y_train, - eval_set, - weights, - ): - output_dim, train_labels = infer_output_dim(y_train) - for X, y in eval_set: - check_output_dim(train_labels, y) - self.output_dim = output_dim - self._default_metric = "auc" if self.output_dim == 2 else "accuracy" - self.classes_ = train_labels - self.target_mapper = {class_label: index for index, class_label in enumerate(self.classes_)} - self.preds_mapper = {str(index): class_label for index, class_label in enumerate(self.classes_)} - self.updated_weights = self.weight_updater(weights) - - def stack_batches(self, list_y_true, list_y_score): - y_true = np.hstack(list_y_true) - y_score = np.vstack(list_y_score) - y_score = softmax(y_score, axis=1) - return y_true, y_score - - def predict_func(self, outputs): - outputs = np.argmax(outputs, axis=1) - return np.vectorize(self.preds_mapper.get)(outputs.astype(str)) - - def predict_proba(self, X): - """ - Make predictions for classification on a batch (valid) - - Parameters - ---------- - X : a :tensor: `torch.Tensor` or matrix: `scipy.sparse.csr_matrix` - Input data - - Returns - ------- - res : np.ndarray - - """ - self.network.eval() - - if scipy.sparse.issparse(X): - dataloader = DataLoader( - SparsePredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - else: - dataloader = DataLoader( - PredictDataset(X), - batch_size=self.batch_size, - shuffle=False, - ) - - results = [] - for batch_nb, data in enumerate(dataloader): - data = data.to(self.device).float() - - output, M_loss = self.network(data) - predictions = 
torch.nn.Softmax(dim=1)(output).cpu().detach().numpy() - results.append(predictions) - res = np.vstack(results) - return res - - -class TabNetRegressor(TabModel): - def __post_init__(self): - super(TabNetRegressor, self).__post_init__() - self._task = "regression" - self._default_loss = torch.nn.functional.mse_loss - self._default_metric = "mse" - - def prepare_target(self, y): - return y - - def compute_loss(self, y_pred, y_true): - return self.loss_fn(y_pred, y_true) - - def update_fit_params(self, X_train, y_train, eval_set, weights): - if len(y_train.shape) != 2: - msg = ( - "Targets should be 2D : (n_samples, n_regression) " - + f"but y_train.shape={y_train.shape} given.\n" - + "Use reshape(-1, 1) for single regression." - ) - raise ValueError(msg) - self.output_dim = y_train.shape[1] - self.preds_mapper = None - - self.updated_weights = weights - filter_weights(self.updated_weights) - - def predict_func(self, outputs): - return outputs - - def stack_batches(self, list_y_true, list_y_score): - y_true = np.vstack(list_y_true) - y_score = np.vstack(list_y_score) - return y_true, y_score diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py deleted file mode 100644 index 4cc67f55..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/tab_network.py +++ /dev/null @@ -1,908 +0,0 @@ -import torch -from torch.nn import Linear, BatchNorm1d, ReLU -import numpy as np -from pytorch_tabnet import sparsemax - - -def initialize_non_glu(module, input_dim, output_dim): - gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(4 * input_dim)) - torch.nn.init.xavier_normal_(module.weight, gain=gain_value) - # torch.nn.init.zeros_(module.bias) - return - - -def initialize_glu(module, input_dim, output_dim): - gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(input_dim)) - torch.nn.init.xavier_normal_(module.weight, gain=gain_value) - # torch.nn.init.zeros_(module.bias) - return - - 
-class GBN(torch.nn.Module): - """ - Ghost Batch Normalization - https://arxiv.org/abs/1705.08741 - """ - - def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01): - super(GBN, self).__init__() - - self.input_dim = input_dim - self.virtual_batch_size = virtual_batch_size - self.bn = BatchNorm1d(self.input_dim, momentum=momentum) - - def forward(self, x): - chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0) - res = [self.bn(x_) for x_ in chunks] - - return torch.cat(res, dim=0) - - -class TabNetEncoder(torch.nn.Module): - def __init__( - self, - input_dim, - output_dim, - n_d=8, - n_a=8, - n_steps=3, - gamma=1.3, - n_independent=2, - n_shared=2, - epsilon=1e-15, - virtual_batch_size=128, - momentum=0.02, - mask_type="sparsemax", - group_attention_matrix=None, - ): - """ - Defines main part of the TabNet network without the embedding layers. - - Parameters - ---------- - input_dim : int - Number of features - output_dim : int or list of int for multi task classification - Dimension of network output - examples : one for regression, 2 for binary classification etc... 
- n_d : int - Dimension of the prediction layer (usually between 4 and 64) - n_a : int - Dimension of the attention layer (usually between 4 and 64) - n_steps : int - Number of successive steps in the network (usually between 3 and 10) - gamma : float - Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) - n_independent : int - Number of independent GLU layer in each GLU block (default 2) - n_shared : int - Number of independent GLU layer in each GLU block (default 2) - epsilon : float - Avoid log(0), this should be kept very low - virtual_batch_size : int - Batch size for Ghost Batch Normalization - momentum : float - Float value between 0 and 1 which will be used for momentum in all batch norm - mask_type : str - Either "sparsemax" or "entmax" : this is the masking function to use - group_attention_matrix : torch matrix - Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j - """ - super(TabNetEncoder, self).__init__() - self.input_dim = input_dim - self.output_dim = output_dim - self.is_multi_task = isinstance(output_dim, list) - self.n_d = n_d - self.n_a = n_a - self.n_steps = n_steps - self.gamma = gamma - self.epsilon = epsilon - self.n_independent = n_independent - self.n_shared = n_shared - self.virtual_batch_size = virtual_batch_size - self.mask_type = mask_type - self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01) - self.group_attention_matrix = group_attention_matrix - - if self.group_attention_matrix is None: - # no groups - self.group_attention_matrix = torch.eye(self.input_dim) - self.attention_dim = self.input_dim - else: - self.attention_dim = self.group_attention_matrix.shape[0] - - if self.n_shared > 0: - shared_feat_transform = torch.nn.ModuleList() - for i in range(self.n_shared): - if i == 0: - shared_feat_transform.append(Linear(self.input_dim, 2 * (n_d + n_a), bias=False)) - else: - shared_feat_transform.append(Linear(n_d + n_a, 2 * (n_d + n_a), bias=False)) - - else: - 
shared_feat_transform = None - - self.initial_splitter = FeatTransformer( - self.input_dim, - n_d + n_a, - shared_feat_transform, - n_glu_independent=self.n_independent, - virtual_batch_size=self.virtual_batch_size, - momentum=momentum, - ) - - self.feat_transformers = torch.nn.ModuleList() - self.att_transformers = torch.nn.ModuleList() - - for step in range(n_steps): - transformer = FeatTransformer( - self.input_dim, - n_d + n_a, - shared_feat_transform, - n_glu_independent=self.n_independent, - virtual_batch_size=self.virtual_batch_size, - momentum=momentum, - ) - attention = AttentiveTransformer( - n_a, - self.attention_dim, - group_matrix=group_attention_matrix, - virtual_batch_size=self.virtual_batch_size, - momentum=momentum, - mask_type=self.mask_type, - ) - self.feat_transformers.append(transformer) - self.att_transformers.append(attention) - - def forward(self, x, prior=None): - x = self.initial_bn(x) - - bs = x.shape[0] # batch size - if prior is None: - prior = torch.ones((bs, self.attention_dim)).to(x.device) - - M_loss = 0 - att = self.initial_splitter(x)[:, self.n_d :] - steps_output = [] - for step in range(self.n_steps): - M = self.att_transformers[step](prior, att) - M_loss += torch.mean(torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1)) - # update prior - prior = torch.mul(self.gamma - M, prior) - # output - M_feature_level = torch.matmul(M, self.group_attention_matrix) - masked_x = torch.mul(M_feature_level, x) - out = self.feat_transformers[step](masked_x) - d = ReLU()(out[:, : self.n_d]) - steps_output.append(d) - # update attention - att = out[:, self.n_d :] - - M_loss /= self.n_steps - return steps_output, M_loss - - def forward_masks(self, x): - x = self.initial_bn(x) - bs = x.shape[0] # batch size - prior = torch.ones((bs, self.attention_dim)).to(x.device) - M_explain = torch.zeros(x.shape).to(x.device) - att = self.initial_splitter(x)[:, self.n_d :] - masks = {} - - for step in range(self.n_steps): - M = 
self.att_transformers[step](prior, att) - M_feature_level = torch.matmul(M, self.group_attention_matrix) - masks[step] = M_feature_level - # update prior - prior = torch.mul(self.gamma - M, prior) - # output - masked_x = torch.mul(M_feature_level, x) - out = self.feat_transformers[step](masked_x) - d = ReLU()(out[:, : self.n_d]) - # explain - step_importance = torch.sum(d, dim=1) - M_explain += torch.mul(M_feature_level, step_importance.unsqueeze(dim=1)) - # update attention - att = out[:, self.n_d :] - - return M_explain, masks - - -class TabNetDecoder(torch.nn.Module): - def __init__( - self, - input_dim, - n_d=8, - n_steps=3, - n_independent=1, - n_shared=1, - virtual_batch_size=128, - momentum=0.02, - ): - """ - Defines main part of the TabNet network without the embedding layers. - - Parameters - ---------- - input_dim : int - Number of features - output_dim : int or list of int for multi task classification - Dimension of network output - examples : one for regression, 2 for binary classification etc... 
- n_d : int - Dimension of the prediction layer (usually between 4 and 64) - n_steps : int - Number of successive steps in the network (usually between 3 and 10) - gamma : float - Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) - n_independent : int - Number of independent GLU layer in each GLU block (default 1) - n_shared : int - Number of independent GLU layer in each GLU block (default 1) - virtual_batch_size : int - Batch size for Ghost Batch Normalization - momentum : float - Float value between 0 and 1 which will be used for momentum in all batch norm - """ - super(TabNetDecoder, self).__init__() - self.input_dim = input_dim - self.n_d = n_d - self.n_steps = n_steps - self.n_independent = n_independent - self.n_shared = n_shared - self.virtual_batch_size = virtual_batch_size - - self.feat_transformers = torch.nn.ModuleList() - - if self.n_shared > 0: - shared_feat_transform = torch.nn.ModuleList() - for i in range(self.n_shared): - shared_feat_transform.append(Linear(n_d, 2 * n_d, bias=False)) - else: - shared_feat_transform = None - - for step in range(n_steps): - transformer = FeatTransformer( - n_d, - n_d, - shared_feat_transform, - n_glu_independent=self.n_independent, - virtual_batch_size=self.virtual_batch_size, - momentum=momentum, - ) - self.feat_transformers.append(transformer) - - self.reconstruction_layer = Linear(n_d, self.input_dim, bias=False) - initialize_non_glu(self.reconstruction_layer, n_d, self.input_dim) - - def forward(self, steps_output): - res = 0 - for step_nb, step_output in enumerate(steps_output): - x = self.feat_transformers[step_nb](step_output) - res = torch.add(res, x) - res = self.reconstruction_layer(res) - return res - - -class TabNetPretraining(torch.nn.Module): - def __init__( - self, - input_dim, - pretraining_ratio=0.2, - n_d=8, - n_a=8, - n_steps=3, - gamma=1.3, - cat_idxs=[], - cat_dims=[], - cat_emb_dim=1, - n_independent=2, - n_shared=2, - epsilon=1e-15, - virtual_batch_size=128, - 
momentum=0.02, - mask_type="sparsemax", - n_shared_decoder=1, - n_indep_decoder=1, - group_attention_matrix=None, - ): - super(TabNetPretraining, self).__init__() - - self.cat_idxs = cat_idxs or [] - self.cat_dims = cat_dims or [] - self.cat_emb_dim = cat_emb_dim - - self.input_dim = input_dim - self.n_d = n_d - self.n_a = n_a - self.n_steps = n_steps - self.gamma = gamma - self.epsilon = epsilon - self.n_independent = n_independent - self.n_shared = n_shared - self.mask_type = mask_type - self.pretraining_ratio = pretraining_ratio - self.n_shared_decoder = n_shared_decoder - self.n_indep_decoder = n_indep_decoder - - if self.n_steps <= 0: - raise ValueError("n_steps should be a positive integer.") - if self.n_independent == 0 and self.n_shared == 0: - raise ValueError("n_shared and n_independent can't be both zero.") - - self.virtual_batch_size = virtual_batch_size - self.embedder = EmbeddingGenerator(input_dim, cat_dims, cat_idxs, cat_emb_dim, group_attention_matrix) - self.post_embed_dim = self.embedder.post_embed_dim - - self.masker = RandomObfuscator(self.pretraining_ratio, group_matrix=self.embedder.embedding_group_matrix) - self.encoder = TabNetEncoder( - input_dim=self.post_embed_dim, - output_dim=self.post_embed_dim, - n_d=n_d, - n_a=n_a, - n_steps=n_steps, - gamma=gamma, - n_independent=n_independent, - n_shared=n_shared, - epsilon=epsilon, - virtual_batch_size=virtual_batch_size, - momentum=momentum, - mask_type=mask_type, - group_attention_matrix=self.embedder.embedding_group_matrix, - ) - self.decoder = TabNetDecoder( - self.post_embed_dim, - n_d=n_d, - n_steps=n_steps, - n_independent=self.n_indep_decoder, - n_shared=self.n_shared_decoder, - virtual_batch_size=virtual_batch_size, - momentum=momentum, - ) - - def forward(self, x): - """ - Returns: res, embedded_x, obf_vars - res : output of reconstruction - embedded_x : embedded input - obf_vars : which variable where obfuscated - """ - embedded_x = self.embedder(x) - if self.training: - masked_x, 
obfuscated_groups, obfuscated_vars = self.masker(embedded_x) - # set prior of encoder with obfuscated groups - prior = 1 - obfuscated_groups - steps_out, _ = self.encoder(masked_x, prior=prior) - res = self.decoder(steps_out) - return res, embedded_x, obfuscated_vars - else: - steps_out, _ = self.encoder(embedded_x) - res = self.decoder(steps_out) - return res, embedded_x, torch.ones(embedded_x.shape).to(x.device) - - def forward_masks(self, x): - embedded_x = self.embedder(x) - return self.encoder.forward_masks(embedded_x) - - -class TabNetNoEmbeddings(torch.nn.Module): - def __init__( - self, - input_dim, - output_dim, - n_d=8, - n_a=8, - n_steps=3, - gamma=1.3, - n_independent=2, - n_shared=2, - epsilon=1e-15, - virtual_batch_size=128, - momentum=0.02, - mask_type="sparsemax", - group_attention_matrix=None, - ): - """ - Defines main part of the TabNet network without the embedding layers. - - Parameters - ---------- - input_dim : int - Number of features - output_dim : int or list of int for multi task classification - Dimension of network output - examples : one for regression, 2 for binary classification etc... 
- n_d : int - Dimension of the prediction layer (usually between 4 and 64) - n_a : int - Dimension of the attention layer (usually between 4 and 64) - n_steps : int - Number of successive steps in the network (usually between 3 and 10) - gamma : float - Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) - n_independent : int - Number of independent GLU layer in each GLU block (default 2) - n_shared : int - Number of independent GLU layer in each GLU block (default 2) - epsilon : float - Avoid log(0), this should be kept very low - virtual_batch_size : int - Batch size for Ghost Batch Normalization - momentum : float - Float value between 0 and 1 which will be used for momentum in all batch norm - mask_type : str - Either "sparsemax" or "entmax" : this is the masking function to use - group_attention_matrix : torch matrix - Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j - """ - super(TabNetNoEmbeddings, self).__init__() - self.input_dim = input_dim - self.output_dim = output_dim - self.is_multi_task = isinstance(output_dim, list) - self.n_d = n_d - self.n_a = n_a - self.n_steps = n_steps - self.gamma = gamma - self.epsilon = epsilon - self.n_independent = n_independent - self.n_shared = n_shared - self.virtual_batch_size = virtual_batch_size - self.mask_type = mask_type - self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01) - - self.encoder = TabNetEncoder( - input_dim=input_dim, - output_dim=output_dim, - n_d=n_d, - n_a=n_a, - n_steps=n_steps, - gamma=gamma, - n_independent=n_independent, - n_shared=n_shared, - epsilon=epsilon, - virtual_batch_size=virtual_batch_size, - momentum=momentum, - mask_type=mask_type, - group_attention_matrix=group_attention_matrix, - ) - - if self.is_multi_task: - self.multi_task_mappings = torch.nn.ModuleList() - for task_dim in output_dim: - task_mapping = Linear(n_d, task_dim, bias=False) - initialize_non_glu(task_mapping, n_d, task_dim) - 
self.multi_task_mappings.append(task_mapping) - else: - self.final_mapping = Linear(n_d, output_dim, bias=False) - initialize_non_glu(self.final_mapping, n_d, output_dim) - - def forward(self, x): - res = 0 - steps_output, M_loss = self.encoder(x) - res = torch.sum(torch.stack(steps_output, dim=0), dim=0) - - if self.is_multi_task: - # Result will be in list format - out = [] - for task_mapping in self.multi_task_mappings: - out.append(task_mapping(res)) - else: - out = self.final_mapping(res) - return out, M_loss - - def forward_masks(self, x): - return self.encoder.forward_masks(x) - - -class TabNet(torch.nn.Module): - def __init__( - self, - input_dim, - output_dim, - n_d=8, - n_a=8, - n_steps=3, - gamma=1.3, - cat_idxs=[], - cat_dims=[], - cat_emb_dim=1, - n_independent=2, - n_shared=2, - epsilon=1e-15, - virtual_batch_size=128, - momentum=0.02, - mask_type="sparsemax", - group_attention_matrix=[], - ): - """ - Defines TabNet network - - Parameters - ---------- - input_dim : int - Initial number of features - output_dim : int - Dimension of network output - examples : one for regression, 2 for binary classification etc... 
- n_d : int - Dimension of the prediction layer (usually between 4 and 64) - n_a : int - Dimension of the attention layer (usually between 4 and 64) - n_steps : int - Number of successive steps in the network (usually between 3 and 10) - gamma : float - Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0) - cat_idxs : list of int - Index of each categorical column in the dataset - cat_dims : list of int - Number of categories in each categorical column - cat_emb_dim : int or list of int - Size of the embedding of categorical features - if int, all categorical features will have same embedding size - if list of int, every corresponding feature will have specific size - n_independent : int - Number of independent GLU layer in each GLU block (default 2) - n_shared : int - Number of independent GLU layer in each GLU block (default 2) - epsilon : float - Avoid log(0), this should be kept very low - virtual_batch_size : int - Batch size for Ghost Batch Normalization - momentum : float - Float value between 0 and 1 which will be used for momentum in all batch norm - mask_type : str - Either "sparsemax" or "entmax" : this is the masking function to use - group_attention_matrix : torch matrix - Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j - """ - super(TabNet, self).__init__() - self.cat_idxs = cat_idxs or [] - self.cat_dims = cat_dims or [] - self.cat_emb_dim = cat_emb_dim - - self.input_dim = input_dim - self.output_dim = output_dim - self.n_d = n_d - self.n_a = n_a - self.n_steps = n_steps - self.gamma = gamma - self.epsilon = epsilon - self.n_independent = n_independent - self.n_shared = n_shared - self.mask_type = mask_type - - if self.n_steps <= 0: - raise ValueError("n_steps should be a positive integer.") - if self.n_independent == 0 and self.n_shared == 0: - raise ValueError("n_shared and n_independent can't be both zero.") - - self.virtual_batch_size = virtual_batch_size - self.embedder = 
EmbeddingGenerator(input_dim, cat_dims, cat_idxs, cat_emb_dim, group_attention_matrix) - self.post_embed_dim = self.embedder.post_embed_dim - - self.tabnet = TabNetNoEmbeddings( - self.post_embed_dim, - output_dim, - n_d, - n_a, - n_steps, - gamma, - n_independent, - n_shared, - epsilon, - virtual_batch_size, - momentum, - mask_type, - self.embedder.embedding_group_matrix, - ) - - def forward(self, x): - x = self.embedder(x) - return self.tabnet(x) - - def forward_masks(self, x): - x = self.embedder(x) - return self.tabnet.forward_masks(x) - - -class AttentiveTransformer(torch.nn.Module): - def __init__( - self, - input_dim, - group_dim, - group_matrix, - virtual_batch_size=128, - momentum=0.02, - mask_type="sparsemax", - ): - """ - Initialize an attention transformer. - - Parameters - ---------- - input_dim : int - Input size - group_dim : int - Number of groups for features - virtual_batch_size : int - Batch size for Ghost Batch Normalization - momentum : float - Float value between 0 and 1 which will be used for momentum in batch norm - mask_type : str - Either "sparsemax" or "entmax" : this is the masking function to use - """ - super(AttentiveTransformer, self).__init__() - self.fc = Linear(input_dim, group_dim, bias=False) - initialize_non_glu(self.fc, input_dim, group_dim) - self.bn = GBN(group_dim, virtual_batch_size=virtual_batch_size, momentum=momentum) - - if mask_type == "sparsemax": - # Sparsemax - self.selector = sparsemax.Sparsemax(dim=-1) - elif mask_type == "entmax": - # Entmax - self.selector = sparsemax.Entmax15(dim=-1) - else: - raise NotImplementedError("Please choose either sparsemax" + "or entmax as masktype") - - def forward(self, priors, processed_feat): - x = self.fc(processed_feat) - x = self.bn(x) - x = torch.mul(x, priors) - x = self.selector(x) - return x - - -class FeatTransformer(torch.nn.Module): - def __init__( - self, - input_dim, - output_dim, - shared_layers, - n_glu_independent, - virtual_batch_size=128, - momentum=0.02, - ): - 
super(FeatTransformer, self).__init__() - """ - Initialize a feature transformer. - - Parameters - ---------- - input_dim : int - Input size - output_dim : int - Output_size - shared_layers : torch.nn.ModuleList - The shared block that should be common to every step - n_glu_independent : int - Number of independent GLU layers - virtual_batch_size : int - Batch size for Ghost Batch Normalization within GLU block(s) - momentum : float - Float value between 0 and 1 which will be used for momentum in batch norm - """ - - params = { - "n_glu": n_glu_independent, - "virtual_batch_size": virtual_batch_size, - "momentum": momentum, - } - - if shared_layers is None: - # no shared layers - self.shared = torch.nn.Identity() - is_first = True - else: - self.shared = GLU_Block( - input_dim, - output_dim, - first=True, - shared_layers=shared_layers, - n_glu=len(shared_layers), - virtual_batch_size=virtual_batch_size, - momentum=momentum, - ) - is_first = False - - if n_glu_independent == 0: - # no independent layers - self.specifics = torch.nn.Identity() - else: - spec_input_dim = input_dim if is_first else output_dim - self.specifics = GLU_Block(spec_input_dim, output_dim, first=is_first, **params) - - def forward(self, x): - x = self.shared(x) - x = self.specifics(x) - return x - - -class GLU_Block(torch.nn.Module): - """ - Independent GLU block, specific to each step - """ - - def __init__( - self, - input_dim, - output_dim, - n_glu=2, - first=False, - shared_layers=None, - virtual_batch_size=128, - momentum=0.02, - ): - super(GLU_Block, self).__init__() - self.first = first - self.shared_layers = shared_layers - self.n_glu = n_glu - self.glu_layers = torch.nn.ModuleList() - - params = {"virtual_batch_size": virtual_batch_size, "momentum": momentum} - - fc = shared_layers[0] if shared_layers else None - self.glu_layers.append(GLU_Layer(input_dim, output_dim, fc=fc, **params)) - for glu_id in range(1, self.n_glu): - fc = shared_layers[glu_id] if shared_layers else None - 
self.glu_layers.append(GLU_Layer(output_dim, output_dim, fc=fc, **params)) - - def forward(self, x): - scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device)) - if self.first: # the first layer of the block has no scale multiplication - x = self.glu_layers[0](x) - layers_left = range(1, self.n_glu) - else: - layers_left = range(self.n_glu) - - for glu_id in layers_left: - x = torch.add(x, self.glu_layers[glu_id](x)) - x = x * scale - return x - - -class GLU_Layer(torch.nn.Module): - def __init__(self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02): - super(GLU_Layer, self).__init__() - - self.output_dim = output_dim - if fc: - self.fc = fc - else: - self.fc = Linear(input_dim, 2 * output_dim, bias=False) - initialize_glu(self.fc, input_dim, 2 * output_dim) - - self.bn = GBN(2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum) - - def forward(self, x): - x = self.fc(x) - x = self.bn(x) - out = torch.mul(x[:, : self.output_dim], torch.sigmoid(x[:, self.output_dim :])) - return out - - -class EmbeddingGenerator(torch.nn.Module): - """ - Classical embeddings generator - """ - - def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dims, group_matrix): - """This is an embedding module for an entire set of features - - Parameters - ---------- - input_dim : int - Number of features coming as input (number of columns) - cat_dims : list of int - Number of modalities for each categorial features - If the list is empty, no embeddings will be done - cat_idxs : list of int - Positional index for each categorical features in inputs - cat_emb_dim : list of int - Embedding dimension for each categorical features - If int, the same embedding dimension will be used for all categorical features - group_matrix : torch matrix - Original group matrix before embeddings - """ - super(EmbeddingGenerator, self).__init__() - - if cat_dims == [] and cat_idxs == []: - self.skip_embedding = True - self.post_embed_dim = input_dim - 
self.embedding_group_matrix = group_matrix.to(group_matrix.device) - return - else: - self.skip_embedding = False - - self.post_embed_dim = int(input_dim + np.sum(cat_emb_dims) - len(cat_emb_dims)) - - self.embeddings = torch.nn.ModuleList() - - for cat_dim, emb_dim in zip(cat_dims, cat_emb_dims): - self.embeddings.append(torch.nn.Embedding(cat_dim, emb_dim)) - - # record continuous indices - self.continuous_idx = torch.ones(input_dim, dtype=torch.bool) - self.continuous_idx[cat_idxs] = 0 - - # update group matrix - n_groups = group_matrix.shape[0] - self.embedding_group_matrix = torch.empty((n_groups, self.post_embed_dim), device=group_matrix.device) - for group_idx in range(n_groups): - post_emb_idx = 0 - cat_feat_counter = 0 - for init_feat_idx in range(input_dim): - if self.continuous_idx[init_feat_idx] == 1: - # this means that no embedding is applied to this column - self.embedding_group_matrix[group_idx, post_emb_idx] = group_matrix[ - group_idx, init_feat_idx - ] # noqa - post_emb_idx += 1 - else: - # this is a categorical feature which creates multiple embeddings - n_embeddings = cat_emb_dims[cat_feat_counter] - self.embedding_group_matrix[group_idx, post_emb_idx : post_emb_idx + n_embeddings] = ( - group_matrix[group_idx, init_feat_idx] / n_embeddings - ) # noqa - post_emb_idx += n_embeddings - cat_feat_counter += 1 - - def forward(self, x): - """ - Apply embeddings to inputs - Inputs should be (batch_size, input_dim) - Outputs will be of size (batch_size, self.post_embed_dim) - """ - if self.skip_embedding: - # no embeddings required - return x - - cols = [] - cat_feat_counter = 0 - for feat_init_idx, is_continuous in enumerate(self.continuous_idx): - # Enumerate through continuous idx boolean mask to apply embeddings - if is_continuous: - cols.append(x[:, feat_init_idx].float().view(-1, 1)) - else: - cols.append(self.embeddings[cat_feat_counter](x[:, feat_init_idx].long())) - cat_feat_counter += 1 - # concat - post_embeddings = torch.cat(cols, dim=1) - 
return post_embeddings - - -class RandomObfuscator(torch.nn.Module): - """ - Create and applies obfuscation masks. - The obfuscation is done at group level to match attention. - """ - - def __init__(self, pretraining_ratio, group_matrix): - """ - This create random obfuscation for self suppervised pretraining - Parameters - ---------- - pretraining_ratio : float - Ratio of feature to randomly discard for reconstruction - - """ - super(RandomObfuscator, self).__init__() - self.pretraining_ratio = pretraining_ratio - # group matrix is set to boolean here to pass all posssible information - self.group_matrix = (group_matrix > 0) + 0.0 - self.num_groups = group_matrix.shape[0] - - def forward(self, x): - """ - Generate random obfuscation mask. - - Returns - ------- - masked input and obfuscated variables. - """ - bs = x.shape[0] - - obfuscated_groups = torch.bernoulli(self.pretraining_ratio * torch.ones((bs, self.num_groups), device=x.device)) - obfuscated_vars = torch.matmul(obfuscated_groups, self.group_matrix) - masked_input = torch.mul(1 - obfuscated_vars, x) - return masked_input, obfuscated_groups, obfuscated_vars diff --git a/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py b/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py deleted file mode 100644 index 52d15a72..00000000 --- a/lightautoml/ml_algo/torch_based/pytorch_tabnet/utils.py +++ /dev/null @@ -1,529 +0,0 @@ -from torch.utils.data import Dataset -from torch.utils.data import DataLoader, WeightedRandomSampler -import torch -import numpy as np -import scipy -import json -from sklearn.utils import check_array -import pandas as pd -import warnings - - -class TorchDataset(Dataset): - """ - Format for numpy array - - Parameters - ---------- - X : 2D array - The input matrix - y : 2D array - The one-hot encoded target - """ - - def __init__(self, x, y): - self.x = x - self.y = y - - def __len__(self): - return len(self.x) - - def __getitem__(self, index): - x, y = self.x[index], self.y[index] - 
return x, y - - -class SparseTorchDataset(Dataset): - """ - Format for csr_matrix - - Parameters - ---------- - X : CSR matrix - The input matrix - y : 2D array - The one-hot encoded target - """ - - def __init__(self, x, y): - self.x = x - self.y = y - - def __len__(self): - return self.x.shape[0] - - def __getitem__(self, index): - x = torch.from_numpy(self.x[index].toarray()[0]).float() - y = self.y[index] - return x, y - - -class PredictDataset(Dataset): - """ - Format for numpy array - - Parameters - ---------- - X : 2D array - The input matrix - """ - - def __init__(self, x): - self.x = x - - def __len__(self): - return len(self.x) - - def __getitem__(self, index): - x = self.x[index] - return x - - -class SparsePredictDataset(Dataset): - """ - Format for csr_matrix - - Parameters - ---------- - X : CSR matrix - The input matrix - """ - - def __init__(self, x): - self.x = x - - def __len__(self): - return self.x.shape[0] - - def __getitem__(self, index): - x = torch.from_numpy(self.x[index].toarray()[0]).float() - return x - - -def create_sampler(weights, y_train): - """ - This creates a sampler from the given weights - - Parameters - ---------- - weights : either 0, 1, dict or iterable - if 0 (default) : no weights will be applied - if 1 : classification only, will balanced class with inverse frequency - if dict : keys are corresponding class values are sample weights - if iterable : list or np array must be of length equal to nb elements - in the training set - y_train : np.array - Training targets - """ - if isinstance(weights, int): - if weights == 0: - need_shuffle = True - sampler = None - elif weights == 1: - need_shuffle = False - class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)]) - - weights = 1.0 / class_sample_count - - samples_weight = np.array([weights[t] for t in y_train]) - - samples_weight = torch.from_numpy(samples_weight) - samples_weight = samples_weight.double() - sampler = 
WeightedRandomSampler(samples_weight, len(samples_weight)) - else: - raise ValueError("Weights should be either 0, 1, dictionnary or list.") - elif isinstance(weights, dict): - # custom weights per class - need_shuffle = False - samples_weight = np.array([weights[t] for t in y_train]) - sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) - else: - # custom weights - if len(weights) != len(y_train): - raise ValueError("Custom weights should match number of train samples.") - need_shuffle = False - samples_weight = np.array(weights) - sampler = WeightedRandomSampler(samples_weight, len(samples_weight)) - return need_shuffle, sampler - - -def create_dataloaders(X_train, y_train, eval_set, weights, batch_size, num_workers, drop_last, pin_memory): - """ - Create dataloaders with or without subsampling depending on weights and balanced. - - Parameters - ---------- - X_train : np.ndarray - Training data - y_train : np.array - Mapped Training targets - eval_set : list of tuple - List of eval tuple set (X, y) - weights : either 0, 1, dict or iterable - if 0 (default) : no weights will be applied - if 1 : classification only, will balanced class with inverse frequency - if dict : keys are corresponding class values are sample weights - if iterable : list or np array must be of length equal to nb elements - in the training set - batch_size : int - how many samples per batch to load - num_workers : int - how many subprocesses to use for data loading. 0 means that the data - will be loaded in the main process - drop_last : bool - set to True to drop the last incomplete batch, if the dataset size is not - divisible by the batch size. 
If False and the size of dataset is not - divisible by the batch size, then the last batch will be smaller - pin_memory : bool - Whether to pin GPU memory during training - - Returns - ------- - train_dataloader, valid_dataloader : torch.DataLoader, torch.DataLoader - Training and validation dataloaders - """ - need_shuffle, sampler = create_sampler(weights, y_train) - - if scipy.sparse.issparse(X_train): - train_dataloader = DataLoader( - SparseTorchDataset(X_train.astype(np.float32), y_train), - batch_size=batch_size, - sampler=sampler, - shuffle=need_shuffle, - num_workers=num_workers, - drop_last=drop_last, - pin_memory=pin_memory, - ) - else: - train_dataloader = DataLoader( - TorchDataset(X_train.astype(np.float32), y_train), - batch_size=batch_size, - sampler=sampler, - shuffle=need_shuffle, - num_workers=num_workers, - drop_last=drop_last, - pin_memory=pin_memory, - ) - - valid_dataloaders = [] - for X, y in eval_set: - if scipy.sparse.issparse(X): - valid_dataloaders.append( - DataLoader( - SparseTorchDataset(X.astype(np.float32), y), - batch_size=batch_size, - shuffle=False, - num_workers=num_workers, - pin_memory=pin_memory, - ) - ) - else: - valid_dataloaders.append( - DataLoader( - TorchDataset(X.astype(np.float32), y), - batch_size=batch_size, - shuffle=False, - num_workers=num_workers, - pin_memory=pin_memory, - ) - ) - - return train_dataloader, valid_dataloaders - - -def create_explain_matrix(input_dim, cat_emb_dim, cat_idxs, post_embed_dim): - """ - This is a computational trick. - In order to rapidly sum importances from same embeddings - to the initial index. 
- - Parameters - ---------- - input_dim : int - Initial input dim - cat_emb_dim : int or list of int - if int : size of embedding for all categorical feature - if list of int : size of embedding for each categorical feature - cat_idxs : list of int - Initial position of categorical features - post_embed_dim : int - Post embedding inputs dimension - - Returns - ------- - reducing_matrix : np.array - Matrix of dim (post_embed_dim, input_dim) to performe reduce - """ - - if isinstance(cat_emb_dim, int): - all_emb_impact = [cat_emb_dim - 1] * len(cat_idxs) - else: - all_emb_impact = [emb_dim - 1 for emb_dim in cat_emb_dim] - - acc_emb = 0 - nb_emb = 0 - indices_trick = [] - for i in range(input_dim): - if i not in cat_idxs: - indices_trick.append([i + acc_emb]) - else: - indices_trick.append(range(i + acc_emb, i + acc_emb + all_emb_impact[nb_emb] + 1)) - acc_emb += all_emb_impact[nb_emb] - nb_emb += 1 - - reducing_matrix = np.zeros((post_embed_dim, input_dim)) - for i, cols in enumerate(indices_trick): - reducing_matrix[cols, i] = 1 - - return scipy.sparse.csc_matrix(reducing_matrix) - - -def create_group_matrix(list_groups, input_dim): - """ - Create the group matrix corresponding to the given list_groups - - Parameters - ---------- - - list_groups : list of list of int - Each element is a list representing features in the same group. - One feature should appear in maximum one group. - Feature that don't get assigned a group will be in their own group of one feature. - - input_dim : number of feature in the initial dataset - - Returns - ------- - - group_matrix : torch matrix - A matrix of size (n_groups, input_dim) - where m_ij represents the importance of feature j in group i - The rows must some to 1 as each group is equally important a priori. 
- - """ - check_list_groups(list_groups, input_dim) - - if len(list_groups) == 0: - group_matrix = torch.eye(input_dim) - return group_matrix - else: - n_groups = input_dim - int(np.sum([len(gp) - 1 for gp in list_groups])) - group_matrix = torch.zeros((n_groups, input_dim)) - - remaining_features = [feat_idx for feat_idx in range(input_dim)] - - current_group_idx = 0 - for group in list_groups: - group_size = len(group) - for elem_idx in group: - # add importrance of element in group matrix and corresponding group - group_matrix[current_group_idx, elem_idx] = 1 / group_size - # remove features from list of features - remaining_features.remove(elem_idx) - # move to next group - current_group_idx += 1 - # features not mentionned in list_groups get assigned their own group of singleton - for remaining_feat_idx in remaining_features: - group_matrix[current_group_idx, remaining_feat_idx] = 1 - current_group_idx += 1 - return group_matrix - - -def check_list_groups(list_groups, input_dim): - """ - Check that list groups: - - is a list of list - - does not contain twice the same feature in different groups - - does not contain unknown features (>= input_dim) - - does not contain empty groups - Parameters - ---------- - - list_groups : list of list of int - Each element is a list representing features in the same group. - One feature should appear in maximum one group. - Feature that don't get assign a group will be in their own group of one feature. - - input_dim : number of feature in the initial dataset - """ - assert isinstance(list_groups, list), "list_groups must be a list of list." - - if len(list_groups) == 0: - return - else: - for group_pos, group in enumerate(list_groups): - msg = f"Groups must be given as a list of list, but found {group} in position {group_pos}." 
# noqa - assert isinstance(group, list), msg - assert len(group) > 0, "Empty groups are forbidding please remove empty groups []" - - n_elements_in_groups = np.sum([len(group) for group in list_groups]) - flat_list = [] - for group in list_groups: - flat_list.extend(group) - unique_elements = np.unique(flat_list) - n_unique_elements_in_groups = len(unique_elements) - msg = f"One feature can only appear in one group, please check your grouped_features." - assert n_unique_elements_in_groups == n_elements_in_groups, msg - - highest_feat = np.max(unique_elements) - assert highest_feat < input_dim, f"Number of features is {input_dim} but one group contains {highest_feat}." # noqa - return - - -def filter_weights(weights): - """ - This function makes sure that weights are in correct format for - regression and multitask TabNet - - Parameters - ---------- - weights : int, dict or list - Initial weights parameters given by user - - Returns - ------- - None : This function will only throw an error if format is wrong - """ - err_msg = """Please provide a list or np.array of weights for """ - err_msg += """regression, multitask or pretraining: """ - if isinstance(weights, int): - if weights == 1: - raise ValueError(err_msg + "1 given.") - if isinstance(weights, dict): - raise ValueError(err_msg + "Dict given.") - return - - -def validate_eval_set(eval_set, eval_name, X_train, y_train): - """Check if the shapes of eval_set are compatible with (X_train, y_train). - - Parameters - ---------- - eval_set : list of tuple - List of eval tuple set (X, y). - The last one is used for early stopping - eval_name : list of str - List of eval set names. - X_train : np.ndarray - Train owned products - y_train : np.array - Train targeted products - - Returns - ------- - eval_names : list of str - Validated list of eval_names. - eval_set : list of tuple - Validated list of eval_set. 
- - """ - eval_name = eval_name or [f"val_{i}" for i in range(len(eval_set))] - - assert len(eval_set) == len(eval_name), "eval_set and eval_name have not the same length" - if len(eval_set) > 0: - assert all(len(elem) == 2 for elem in eval_set), "Each tuple of eval_set need to have two elements" - for name, (X, y) in zip(eval_name, eval_set): - check_input(X) - msg = f"Dimension mismatch between X_{name} " + f"{X.shape} and X_train {X_train.shape}" - assert len(X.shape) == len(X_train.shape), msg - - msg = f"Dimension mismatch between y_{name} " + f"{y.shape} and y_train {y_train.shape}" - assert len(y.shape) == len(y_train.shape), msg - - msg = f"Number of columns is different between X_{name} " + f"({X.shape[1]}) and X_train ({X_train.shape[1]})" - assert X.shape[1] == X_train.shape[1], msg - - if len(y_train.shape) == 2: - msg = ( - f"Number of columns is different between y_{name} " + f"({y.shape[1]}) and y_train ({y_train.shape[1]})" - ) - assert y.shape[1] == y_train.shape[1], msg - msg = f"You need the same number of rows between X_{name} " + f"({X.shape[0]}) and y_{name} ({y.shape[0]})" - assert X.shape[0] == y.shape[0], msg - - return eval_name, eval_set - - -def define_device(device_name): - """ - Define the device to use during training and inference. 
- If auto it will detect automatically whether to use cuda or cpu - - Parameters - ---------- - device_name : str - Either "auto", "cpu" or "cuda" - - Returns - ------- - str - Either "cpu" or "cuda" - """ - if device_name == "auto": - if torch.cuda.is_available(): - return "cuda" - else: - return "cpu" - elif device_name == "cuda" and not torch.cuda.is_available(): - return "cpu" - else: - return device_name - - -class ComplexEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, (np.generic, np.ndarray)): - return obj.tolist() - # Let the base class default method raise the TypeError - return json.JSONEncoder.default(self, obj) - - -def check_input(X): - """ - Raise a clear error if X is a pandas dataframe - and check array according to scikit rules - """ - if isinstance(X, (pd.DataFrame, pd.Series)): - err_message = "Pandas DataFrame are not supported: apply X.values when calling fit" - raise TypeError(err_message) - check_array(X, accept_sparse=True) - - -def check_warm_start(warm_start, from_unsupervised): - """ - Gives a warning about ambiguous usage of the two parameters. - """ - if warm_start and from_unsupervised is not None: - warn_msg = "warm_start=True and from_unsupervised != None: " - warn_msg = "warm_start will be ignore, training will start from unsupervised weights" - warnings.warn(warn_msg) - return - - -def check_embedding_parameters(cat_dims, cat_idxs, cat_emb_dim): - """ - Check parameters related to embeddings and rearrange them in a unique manner. - """ - if (cat_dims == []) ^ (cat_idxs == []): - if cat_dims == []: - msg = "If cat_idxs is non-empty, cat_dims must be defined as a list of same length." - else: - msg = "If cat_dims is non-empty, cat_idxs must be defined as a list of same length." - raise ValueError(msg) - elif len(cat_dims) != len(cat_idxs): - msg = "The lists cat_dims and cat_idxs must have the same length." 
- raise ValueError(msg) - - if isinstance(cat_emb_dim, int): - cat_emb_dims = [cat_emb_dim] * len(cat_idxs) - else: - cat_emb_dims = cat_emb_dim - - # check that all embeddings are provided - if len(cat_emb_dims) != len(cat_dims): - msg = f"""cat_emb_dim and cat_dims must be lists of same length, got {len(cat_emb_dims)} - and {len(cat_dims)}""" - raise ValueError(msg) - - # Rearrange to get reproducible seeds with different ordering - if len(cat_idxs) > 0: - sorted_idxs = np.argsort(cat_idxs) - cat_dims = [cat_dims[i] for i in sorted_idxs] - cat_emb_dims = [cat_emb_dims[i] for i in sorted_idxs] - - return cat_dims, cat_idxs, cat_emb_dims diff --git a/lightautoml/text/embed.py b/lightautoml/text/embed.py index fb22b91e..e8e46afa 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -530,6 +530,7 @@ def __init__(self, n: int, d_in: int, d_out: int) -> None: self.layers = nn.ModuleList([nn.Linear(d_in, d_out) for _ in range(n)]) def forward(self, x): + """Forward-pass.""" return torch.stack([l(x[:, i]) for i, l in enumerate(self.layers)], 1) @@ -570,6 +571,7 @@ def get_out_shape(self) -> int: return self.n_features def forward(self, x: Tensor) -> Tensor: + """Forward-pass.""" x = self._cos_sin(2 * torch.pi * self.coefficients[None] * x[..., None]) if self.flatten_output: return x.view(x.shape[0], -1) @@ -642,23 +644,20 @@ def __init__(self, *args, **kwargs): class SoftEmbedding(torch.nn.Module): - """ - Soft-one hot encoding embedding technique, from https://arxiv.org/pdf/1708.00065.pdf - In a nutshell, it represents a continuous feature as a weighted average of embeddings - """ + """Soft-one hot encoding embedding technique, from https://arxiv.org/pdf/1708.00065.pdf. 
- def __init__(self, num_dims, embedding_size=10, flatten_output: bool = False, **kwargs) -> None: - """ + In a nutshell, it represents a continuous feature as a weighted average of embeddings - Parameters - ---------- + Args: num_embeddings: Number of embeddings to use (cardinality of the embedding table). embeddings_dim: The dimension of the vector space for projecting the scalar value. embeddings_init_std: The standard deviation factor for normal initialization of the embedding matrix weights. emb_initializer: Dict where keys are feature names and values are callable to initialize embedding tables - """ + """ + + def __init__(self, num_dims, embedding_size=10, flatten_output: bool = False, **kwargs) -> None: super(SoftEmbedding, self).__init__() self.embedding_table = torch.nn.Embedding(num_dims, embedding_size) nn.init.xavier_uniform_(self.embedding_table.weight) From 2557c4c82a53e7559d442742629c39ca6f31da5d Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Fri, 1 Sep 2023 13:48:45 +0000 Subject: [PATCH 30/49] bugfix --- lightautoml/ml_algo/torch_based/nn_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index cee78575..17f8afff 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -8,7 +8,7 @@ import numpy as np import torch import torch.nn as nn -from lightautoml.ml_algo.tabnet.utils import TabNetEncoder, initialize_non_glu +from lightautoml.ml_algo.tabnet.utils import TabNetEncoder, _initialize_non_glu from lightautoml.ml_algo.torch_based.autoint.autoint_utils import AttnInteractionBlock, LeakyGate from lightautoml.ml_algo.torch_based.autoint.ghost_norm import GhostBatchNorm @@ -1063,11 +1063,11 @@ def __init__( self.multi_task_mappings = torch.nn.ModuleList() for task_dim in n_out: task_mapping = nn.Linear(n_d, task_dim, bias=False) - initialize_non_glu(task_mapping, n_d, 
task_dim) + _initialize_non_glu(task_mapping, n_d, task_dim) self.multi_task_mappings.append(task_mapping) else: self.final_mapping = nn.Linear(n_d, n_out, bias=False) - initialize_non_glu(self.final_mapping, n_d, n_out) + _initialize_non_glu(self.final_mapping, n_d, n_out) def forward(self, x): """Forward-pass.""" From 42fd85fe9a3298f468c28712cbb67b388fc6041b Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Fri, 1 Sep 2023 13:55:32 +0000 Subject: [PATCH 31/49] changed import links --- lightautoml/ml_algo/torch_based/nn_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 30676df8..99914e32 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -8,10 +8,10 @@ import numpy as np import torch import torch.nn as nn -from lightautoml.ml_algo.torch_based.autoint.autoint_utils import AttnInteractionBlock, LeakyGate -from lightautoml.ml_algo.torch_based.autoint.ghost_norm import GhostBatchNorm +from .autoint.autoint_utils import AttnInteractionBlock, LeakyGate +from .autoint.ghost_norm import GhostBatchNorm -from lightautoml.ml_algo.torch_based.node_nn_model import DenseODSTBlock, MeanPooling +from .node_nn_model import DenseODSTBlock, MeanPooling class GaussianNoise(nn.Module): From 7a8bf65debc1d30cd7b1bd0f979230cd5caa2f46 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 6 Sep 2023 13:11:28 +0000 Subject: [PATCH 32/49] changed import links --- lightautoml/ml_algo/tabnet/utils.py | 4 ++-- lightautoml/ml_algo/torch_based/nn_models.py | 21 +++----------------- lightautoml/ml_algo/tuning/base.py | 6 +++--- lightautoml/ml_algo/tuning/optuna.py | 14 ++++++------- 4 files changed, 15 insertions(+), 30 deletions(-) diff --git a/lightautoml/ml_algo/tabnet/utils.py b/lightautoml/ml_algo/tabnet/utils.py index a901b7c6..8530be5d 100644 --- a/lightautoml/ml_algo/tabnet/utils.py +++ 
b/lightautoml/ml_algo/tabnet/utils.py @@ -2,8 +2,8 @@ import torch import numpy as np import torch.nn as nn -from lightautoml.ml_algo.torch_based.node_nn_model import Entmax15, Sparsemax -from lightautoml.ml_algo.torch_based.autoint.ghost_norm import GhostBatchNorm +from ..torch_based.node_nn_model import Entmax15, Sparsemax +from ..torch_based.autoint.ghost_norm import GhostBatchNorm def _initialize_non_glu(module, input_dim, output_dim): diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 56813004..f380ef38 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -1143,7 +1143,6 @@ def __init__( super(TabNet, self).__init__() self.input_dim = n_in self.output_dim = n_out - self.is_multi_task = isinstance(n_out, list) self.n_d = n_d self.n_a = n_a self.n_steps = n_steps @@ -1171,29 +1170,15 @@ def __init__( group_attention_matrix=group_attention_matrix, ) - if self.is_multi_task: - self.multi_task_mappings = torch.nn.ModuleList() - for task_dim in n_out: - task_mapping = nn.Linear(n_d, task_dim, bias=False) - _initialize_non_glu(task_mapping, n_d, task_dim) - self.multi_task_mappings.append(task_mapping) - else: - self.final_mapping = nn.Linear(n_d, n_out, bias=False) - _initialize_non_glu(self.final_mapping, n_d, n_out) + self.final_mapping = nn.Linear(n_d, n_out, bias=True) + _initialize_non_glu(self.final_mapping, n_d, n_out) def forward(self, x): """Forward-pass.""" res = 0 steps_output, M_loss = self.encoder(x) res = torch.sum(torch.stack(steps_output, dim=0), dim=0) - - if self.is_multi_task: - # Result will be in list format - out = [] - for task_mapping in self.multi_task_mappings: - out.append(task_mapping(res)) - else: - out = self.final_mapping(res) + out = self.final_mapping(res) return out def forward_masks(self, x): diff --git a/lightautoml/ml_algo/tuning/base.py b/lightautoml/ml_algo/tuning/base.py index 5c1a803e..692c5080 100644 --- 
a/lightautoml/ml_algo/tuning/base.py +++ b/lightautoml/ml_algo/tuning/base.py @@ -7,11 +7,11 @@ from typing import Tuple from typing import overload -from lightautoml.dataset.base import LAMLDataset +from ...dataset.base import LAMLDataset # if TYPE_CHECKING: -from lightautoml.ml_algo.base import MLAlgo -from lightautoml.validation.base import TrainValidIterator +from ...ml_algo.base import MLAlgo +from ...validation.base import TrainValidIterator class DistributionBase(ABC): diff --git a/lightautoml/ml_algo/tuning/optuna.py b/lightautoml/ml_algo/tuning/optuna.py index 3e86e4dd..eade5d12 100644 --- a/lightautoml/ml_algo/tuning/optuna.py +++ b/lightautoml/ml_algo/tuning/optuna.py @@ -12,13 +12,13 @@ import optuna -from lightautoml.dataset.base import LAMLDataset -from lightautoml.ml_algo.base import MLAlgo -from lightautoml.ml_algo.tuning.base import Choice -from lightautoml.ml_algo.tuning.base import ParamsTuner -from lightautoml.ml_algo.tuning.base import Uniform -from lightautoml.validation.base import HoldoutIterator -from lightautoml.validation.base import TrainValidIterator +from ...dataset.base import LAMLDataset +from ..base import MLAlgo +from .base import Choice +from .base import ParamsTuner +from .base import Uniform +from ...validation.base import HoldoutIterator +from ...validation.base import TrainValidIterator logger = logging.getLogger(__name__) From 94fdd763ac2d88feef9780201977521c7c09147c Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 6 Sep 2023 14:41:19 +0000 Subject: [PATCH 33/49] bugfix --- lightautoml/ml_algo/dl_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 010a4a28..bff1eedc 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -125,8 +125,6 @@ "soft": SoftEmbeddingFlat, } cont_embedder_by_name = {"linear": LinearEmbedding, "dense": DenseEmbedding, "plr": PLREmbedding, "soft": SoftEmbedding} 
-cont_embedder_by_name_flat = {"cont": ContEmbedder, "linear": LinearEmbeddingFlat, "dense": DenseEmbeddingFlat} -cont_embedder_by_name = {"linear": LinearEmbedding, "dense": DenseEmbedding} class TorchModel(TabularMLAlgo): From 1c4170e8d1d5d75eae1671393d7de2bd711422e8 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Thu, 7 Sep 2023 12:52:28 +0000 Subject: [PATCH 34/49] some new changes --- lightautoml/ml_algo/base.py | 4 +- lightautoml/ml_algo/torch_based/nn_models.py | 15 +++--- lightautoml/text/embed.py | 3 +- lightautoml/text/nn_model.py | 57 +++++++++++++++----- 4 files changed, 57 insertions(+), 22 deletions(-) diff --git a/lightautoml/ml_algo/base.py b/lightautoml/ml_algo/base.py index 904b1e0a..0dec5aba 100755 --- a/lightautoml/ml_algo/base.py +++ b/lightautoml/ml_algo/base.py @@ -240,7 +240,9 @@ def fit_predict(self, train_valid_iterator: TrainValidIterator) -> NumpyDataset: iterator_len = len(train_valid_iterator) if iterator_len > 1: logger.info("Start fitting \x1b[1m{}\x1b[0m ...".format(self._name)) - logger.debug(f"Training params: {self.params}") + stop_params = ["cat_features", "cont_features", "cat_dims", "cat_vc"] + printable_params = {key: value for key, value in self.params.items() if key not in stop_params} + logger.debug(f"Training params: {printable_params}") # save features names self._features = train_valid_iterator.features diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index f380ef38..485d0ac4 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -154,8 +154,8 @@ def __init__( dropout_first: bool = True, bn_momentum: float = 0.1, ghost_batch: Optional[int] = 64, - leaky_gate: bool = True, use_skip: bool = True, + leaky_gate: bool = True, weighted_sum: bool = True, device: torch.device = torch.device("cuda:0"), **kwargs, @@ -180,7 +180,7 @@ def __init__( self.features.add_module("dense0", nn.Linear(n_in, num_features)) if 
leaky_gate: - self.features.add_module("leakygate0", LeakyGate(n_in)) + self.features.add_module("leakygate0", LeakyGate(num_features)) if dropout_first and drop_rate[0] > 0: self.features.add_module("dropout0", nn.Dropout(drop_rate[0])) @@ -228,7 +228,7 @@ def forward(self, X: torch.Tensor) -> torch.Tensor: x = X input = x.detach().clone() for name, layer in self.features.named_children(): - if name != "denseblock1" and name != "dense0" and self.concat_input: + if name not in ["dropout0", "leakygate0", "denseblock1", "dense0"] and self.concat_input: x = torch.cat([x, input], 1) x = layer(x) out = self.fc(x) @@ -976,6 +976,7 @@ def __init__( use_skip=mlp_use_skip, device=device, ) + self.use_skip = True if weighted_sum: self.mix = nn.Parameter(torch.tensor([0.0], device=device)) else: @@ -1127,16 +1128,16 @@ def __init__( self, n_in, n_out, - n_d=8, - n_a=8, - n_steps=3, + n_d=32, + n_a=32, + n_steps=1, gamma=1.3, n_independent=2, n_shared=2, epsilon=1e-15, virtual_batch_size=128, momentum=0.02, - mask_type="sparsemax", + mask_type="entemax", group_attention_matrix=None, **kwargs, ): diff --git a/lightautoml/text/embed.py b/lightautoml/text/embed.py index e8e46afa..0fbe062d 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -12,6 +12,7 @@ import torch.nn as nn from torch import Tensor import operator +import numpy as np try: from transformers import AutoModel @@ -572,7 +573,7 @@ def get_out_shape(self) -> int: def forward(self, x: Tensor) -> Tensor: """Forward-pass.""" - x = self._cos_sin(2 * torch.pi * self.coefficients[None] * x[..., None]) + x = self._cos_sin(2 * np.pi * self.coefficients[None] * x[..., None]) if self.flatten_output: return x.view(x.shape[0], -1) return x diff --git a/lightautoml/text/nn_model.py b/lightautoml/text/nn_model.py index dc4db2ae..58fa1574 100644 --- a/lightautoml/text/nn_model.py +++ b/lightautoml/text/nn_model.py @@ -162,13 +162,34 @@ def __init__( ) if bias is not None: - try: - last_layer = list( - filter( 
- lambda x: isinstance(x, nn.Linear) or isinstance(x, nn.Sequential), - list(self.torch_model.children()), - ) - )[-1] + self._set_last_layer(self.torch_model, bias) + + self.сlump = Clump() + self.sig = nn.Sigmoid() + self.softmax = nn.Softmax(dim=1) + + def _set_last_layer(self, torch_model, bias): + try: + use_skip = torch_model.use_skip + self._init_last_layers(torch_model, bias, use_skip) + except: + self._init_last_layers(torch_model, bias, False) + + def _init_last_layers(self, torch_model, bias, use_skip=False): + try: + all_layers = list(torch_model.children()) + layers = list( + filter( + lambda x: isinstance(x, nn.Linear) or isinstance(x, nn.Sequential), + all_layers, + ) + ) + if len(layers) == 0: + last_layer = all_layers[-1] + self._set_last_layer(last_layer, bias) + + else: + last_layer = layers[-1] while isinstance(last_layer, nn.Sequential): last_layer = list( filter(lambda x: isinstance(x, nn.Linear) or isinstance(x, nn.Sequential), last_layer) @@ -177,12 +198,22 @@ def __init__( last_layer.bias.data = bias shape = last_layer.weight.data.shape last_layer.weight.data = torch.zeros(shape[0], shape[1], requires_grad=True) - except: - logger.info3("Last linear layer not founded, so init_bias=False") - - self.сlump = Clump() - self.sig = nn.Sigmoid() - self.softmax = nn.Softmax(dim=1) + if use_skip: + if len(layers) <= 1: + last_layer = all_layers[-2] + self._set_last_layer(last_layer, bias) + else: + pre_last_layer = layers[-2] + while isinstance(last_layer, nn.Sequential): + pre_last_layer = list( + filter(lambda x: isinstance(x, nn.Linear) or isinstance(x, nn.Sequential), pre_last_layer) + )[-1] + bias = torch.Tensor(bias) + pre_last_layer.bias.data = bias + shape = pre_last_layer.weight.data.shape + pre_last_layer.weight.data = torch.zeros(shape[0], shape[1], requires_grad=True) + except: + logger.info3("Last linear layer not founded, so init_bias=False") def get_logits(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: """Forward-pass of model 
with embeddings.""" From 203511350ed81aae9b4f28c7829dd35a194f9417 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Thu, 7 Sep 2023 13:06:20 +0000 Subject: [PATCH 35/49] now we dont count VC for cat features for every embedding --- lightautoml/ml_algo/dl_model.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index bff1eedc..e8051613 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -421,11 +421,12 @@ def _init_params_on_input(self, train_valid_iterator) -> dict: ) + 1 ) - values, counts = np.unique( - np.concatenate([train_valid_iterator.train[:, cat_feature].data, valid[:, cat_feature].data]), - return_counts=True, - ) - cat_value_counts.append(dict(zip(values, counts))) + if params["cat_embedder"] == "weighted": + values, counts = np.unique( + np.concatenate([train_valid_iterator.train[:, cat_feature].data, valid[:, cat_feature].data]), + return_counts=True, + ) + cat_value_counts.append(dict(zip(values, counts))) cat_dims.append(num_unique_categories) new_params["cat_dims"] = cat_dims new_params["cat_vc"] = cat_value_counts From 0afe07f7e92076e541a5bfaff5101c1bbcff59a4 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Fri, 8 Sep 2023 14:28:01 +0000 Subject: [PATCH 36/49] no embedder bugfix --- lightautoml/ml_algo/dl_model.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index e8051613..0a207ce1 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -115,7 +115,7 @@ "cat_no_dropout": BasicCatEmbeddingFlat, "weighted": WeightedCatEmbeddingFlat, } -cat_embedder_by_name = {"cat_no_dropout": BasicCatEmbedding, "weighted": WeightedCatEmbedding} +cat_embedder_by_name = {"cat": BasicCatEmbedding, "cat_no_dropout": BasicCatEmbedding, "weighted": WeightedCatEmbedding} cont_embedder_by_name_flat = { 
"cont": ContEmbedder, @@ -124,7 +124,13 @@ "plr": PLREmbeddingFlat, "soft": SoftEmbeddingFlat, } -cont_embedder_by_name = {"linear": LinearEmbedding, "dense": DenseEmbedding, "plr": PLREmbedding, "soft": SoftEmbedding} +cont_embedder_by_name = { + "cont": LinearEmbedding, + "linear": LinearEmbedding, + "dense": DenseEmbedding, + "plr": PLREmbedding, + "soft": SoftEmbedding, +} class TorchModel(TabularMLAlgo): @@ -299,7 +305,7 @@ def _infer_params(self): net_params={ "task": self.task, "cont_embedder_": cont_embedder_by_name.get(params["cont_embedder"], LinearEmbedding) - if input_type_by_name[params["model"]] == "seq" + if input_type_by_name[params["model"]] == "seq" and is_cont else cont_embedder_by_name_flat.get(params["cont_embedder"], ContEmbedder) if is_cont else None, @@ -312,7 +318,7 @@ def _infer_params(self): if is_cont else None, "cat_embedder_": cat_embedder_by_name.get(params["cat_embedder"], BasicCatEmbedding) - if input_type_by_name[params["model"]] == "seq" + if input_type_by_name[params["model"]] == "seq" and is_cat else cat_embedder_by_name_flat.get(params["cat_embedder"], CatEmbedder) if is_cat else None, From 39beb9ec0ff92c2bb7036b844d017e44e0729453 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Fri, 8 Sep 2023 15:58:32 +0000 Subject: [PATCH 37/49] scheduler params --- lightautoml/automl/presets/base.py | 3 ++- lightautoml/automl/presets/tabular_config.yml | 2 +- lightautoml/ml_algo/dl_model.py | 6 +++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lightautoml/automl/presets/base.py b/lightautoml/automl/presets/base.py index 01c3f01f..975358ef 100644 --- a/lightautoml/automl/presets/base.py +++ b/lightautoml/automl/presets/base.py @@ -37,8 +37,9 @@ def upd_params(old: dict, new: dict) -> dict: Updated parameters. 
""" + not_updatable_params = ["scheduler_params"] for k in new: - if type(new[k]) is dict and k in old and type(old[k]) is dict: + if type(new[k]) is dict and k in old and type(old[k]) is dict and k not in not_updatable_params: upd_params(old[k], new[k]) else: old[k] = new[k] diff --git a/lightautoml/automl/presets/tabular_config.yml b/lightautoml/automl/presets/tabular_config.yml index d391d5e8..cecb9a9d 100755 --- a/lightautoml/automl/presets/tabular_config.yml +++ b/lightautoml/automl/presets/tabular_config.yml @@ -184,7 +184,7 @@ nn_params: # scheduler sch: ReduceLROnPlateau # params of ReduceLROnPlateau scheduler - scheduler_params: {} #{ 'patience': 5, 'factor': 0.5, 'min_lr': 0.00001 } + scheduler_params: { 'patience': 5, 'factor': 0.5, 'min_lr': 0.00001 } # using snapshot ensembles # https://arxiv.org/abs/1704.00109 is_snap: false diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 0a207ce1..4fe7240c 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -115,7 +115,11 @@ "cat_no_dropout": BasicCatEmbeddingFlat, "weighted": WeightedCatEmbeddingFlat, } -cat_embedder_by_name = {"cat": BasicCatEmbedding, "cat_no_dropout": BasicCatEmbedding, "weighted": WeightedCatEmbedding} +cat_embedder_by_name = { + "cat_no_dropout": BasicCatEmbedding, + "cat_no_dropout": BasicCatEmbedding, + "weighted": WeightedCatEmbedding, +} cont_embedder_by_name_flat = { "cont": ContEmbedder, From 6294e9f51ed38bcb2a79f5cf0056ef98ab6637c2 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 11 Sep 2023 09:29:53 +0000 Subject: [PATCH 38/49] bfixs --- lightautoml/ml_algo/dl_model.py | 24 ++++++++++--------- lightautoml/ml_algo/torch_based/nn_models.py | 9 ++++--- .../ml_algo/torch_based/node_nn_model.py | 14 ++++++++++- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 4fe7240c..4338659d 100644 --- a/lightautoml/ml_algo/dl_model.py +++ 
b/lightautoml/ml_algo/dl_model.py @@ -314,10 +314,11 @@ def _infer_params(self): if is_cont else None, "cont_params": { - "num_dims": params["num_dims"], - "input_bn": params["input_bn"], - "device": params["device"], - "embedding_size": params["embedding_size"], + # "num_dims": params["num_dims"], + # "input_bn": params["input_bn"], + # "device": params["device"], + # "embedding_size": params["embedding_size"], + **params } if is_cont else None, @@ -327,13 +328,14 @@ def _infer_params(self): if is_cat else None, "cat_params": { - "cat_vc": params["cat_vc"], - "cat_dims": params["cat_dims"], - "emb_dropout": params["emb_dropout"], - "emb_ratio": params["emb_ratio"], - "max_emb_size": params["max_emb_size"], - "embedding_size": params["embedding_size"], - "device": params["device"], + # "cat_vc": params["cat_vc"], + # "cat_dims": params["cat_dims"], + # "emb_dropout": params["emb_dropout"], + # "emb_ratio": params["emb_ratio"], + # "max_emb_size": params["max_emb_size"], + # "embedding_size": params["embedding_size"], + # "device": params["device"], + **params } if is_cat else None, diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index 485d0ac4..752b1b77 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -153,9 +153,9 @@ def __init__( concat_input: bool = True, dropout_first: bool = True, bn_momentum: float = 0.1, - ghost_batch: Optional[int] = 64, - use_skip: bool = True, - leaky_gate: bool = True, + ghost_batch: Optional[int] = None, + use_skip: bool = False, + leaky_gate: bool = False, weighted_sum: bool = True, device: torch.device = torch.device("cuda:0"), **kwargs, @@ -828,6 +828,7 @@ class NODE(nn.Module): layer_dim: num trees in one layer. num_layers: number of forests. tree_dim: number of response channels in the response of individual tree. 
+ choice_function: str `entmax` or `sparsmax` use_original_head use averaging as a head or put linear layer instead. depth: number of splits in every tree. drop_rate: Dropout rate for each layer altogether. @@ -843,6 +844,7 @@ def __init__( layer_dim: int = 2048, num_layers: int = 1, tree_dim: int = 1, + choice_function="entmax", use_original_head: bool = False, depth: int = 6, drop_rate: float = 0.0, @@ -861,6 +863,7 @@ def __init__( num_layers=num_layers, tree_dim=tree_dim if not use_original_head else n_out, depth=depth, + choice_function=choice_function, input_dropout=drop_rate, flatten_output=not use_original_head, ) diff --git a/lightautoml/ml_algo/torch_based/node_nn_model.py b/lightautoml/ml_algo/torch_based/node_nn_model.py index e57f5125..e3f3f6da 100644 --- a/lightautoml/ml_algo/torch_based/node_nn_model.py +++ b/lightautoml/ml_algo/torch_based/node_nn_model.py @@ -554,6 +554,7 @@ class DenseODSTBlock(nn.Sequential): max_features: maximum number of features per input depth: number of splits in every tree. input_dropout: Dropout rate forest layer. + choice_function: str `entmax` or `sparsmax`. flatten_output: flatten output or not. 
""" @@ -565,12 +566,23 @@ def __init__( tree_dim=1, max_features=None, input_dropout=0.0, + choice_function="entmax", flatten_output=True, **kwargs ): layers = [] + ch_f = Sparsemax() if choice_function == "sparsmax" else Entmax15() + bin_f = Sparsemoid() if choice_function == "sparsmax" else Entmoid15() for i in range(num_layers): - oddt = ODST(input_dim, layer_dim, tree_dim=tree_dim, flatten_output=True, **kwargs) + oddt = ODST( + input_dim, + layer_dim, + tree_dim=tree_dim, + flatten_output=True, + choice_function=ch_f, + bin_function=bin_f, + **kwargs + ) input_dim = min(input_dim + layer_dim * tree_dim, max_features or float("inf")) layers.append(oddt) From 81fab518d71ebc98f8fe181b6443ab89a72913f1 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 11 Sep 2023 09:35:55 +0000 Subject: [PATCH 39/49] bfixs --- lightautoml/text/nn_model.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lightautoml/text/nn_model.py b/lightautoml/text/nn_model.py index 58fa1574..3ad54bb9 100644 --- a/lightautoml/text/nn_model.py +++ b/lightautoml/text/nn_model.py @@ -169,11 +169,8 @@ def __init__( self.softmax = nn.Softmax(dim=1) def _set_last_layer(self, torch_model, bias): - try: - use_skip = torch_model.use_skip - self._init_last_layers(torch_model, bias, use_skip) - except: - self._init_last_layers(torch_model, bias, False) + use_skip = getattr(torch_model, "use_skip", False) + self._init_last_layers(torch_model, bias, use_skip) def _init_last_layers(self, torch_model, bias, use_skip=False): try: From 99d77f8eee459a4cd068955c820f62e023424fc5 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Tue, 12 Sep 2023 10:40:11 +0000 Subject: [PATCH 40/49] mlp embedder --- lightautoml/text/embed.py | 58 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/lightautoml/text/embed.py b/lightautoml/text/embed.py index 0fbe062d..fa0ea834 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -705,3 +705,61 @@ 
class SoftEmbeddingFlat(SoftEmbedding): def __init__(self, *args, **kwargs): super(SoftEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) + + +class MLPContEmbedding(nn.Module): + """MLP multi-dim embedding. + + Args: + num_dims : num of features. + d_in: input size. + d_out: output size. + d_hidden: hidden size. + """ + + def __init__( + self, + num_dims: int, + embedding_size: int = 10, + d_hidden: int = 64, + flatten_output: bool = False, + **kwargs, + ) -> None: + super().__init__() + self.flatten_output = flatten_output + self.embedding_size = embedding_size + self.num_dims = num_dims + self.layers = nn.ModuleList( + [ + nn.Sequential(nn.Linear(1, d_hidden), nn.ReLU(), nn.Linear(d_hidden, embedding_size)) + for _ in range(num_dims) + ] + ) + + def get_out_shape(self) -> int: + """Output shape. + + Returns: + int with module output shape. + + """ + if self.flatten_output: + return self.num_dims * self.embedding_size + else: + return self.num_dims + + def forward(self, X: Dict) -> Tensor: + """Produce embedding for each value in input. 
+ + Args: + X : Dict + + Returns: + torch.Tensor + + """ + x = X["cont"] + x = torch.stack([l(x[:, i]) for i, l in enumerate(self.layers)], 1) + if self.flatten_output: + return x.view(x.shape[0], -1) + return x From 41e547f279ecbbcd389d98e3daeacfdcc9e798fd Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 20 Sep 2023 14:21:04 +0000 Subject: [PATCH 41/49] no descr --- lightautoml/automl/presets/tabular_presets.py | 1 + lightautoml/ml_algo/dl_model.py | 60 ++++-- lightautoml/ml_algo/torch_based/nn_models.py | 90 +++++++++ .../ml_algo/torch_based/saint/saint.py | 144 +++++++++++++++ lightautoml/text/nn_model.py | 6 +- lightautoml/text/trainer.py | 171 +++++++++++++++++- lightautoml/text/utils.py | 2 +- 7 files changed, 445 insertions(+), 29 deletions(-) create mode 100644 lightautoml/ml_algo/torch_based/saint/saint.py diff --git a/lightautoml/automl/presets/tabular_presets.py b/lightautoml/automl/presets/tabular_presets.py index 166cb653..4b46e1da 100755 --- a/lightautoml/automl/presets/tabular_presets.py +++ b/lightautoml/automl/presets/tabular_presets.py @@ -609,6 +609,7 @@ def create_automl(self, **fit_args): "autoint", "tabnet", "fttransformer", + "saint", ] available_nn_models = available_nn_models + [x + "_tuned" for x in available_nn_models] nn_models = [ diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 4338659d..38b1f521 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -1,6 +1,7 @@ """Neural net for tabular datasets.""" +from itertools import cycle from lightautoml.utils.installation import __validate_extra_deps @@ -73,6 +74,7 @@ from .torch_based.nn_models import MLP, TabNet from .torch_based.nn_models import NODE from .torch_based.nn_models import SNN +from .torch_based.nn_models import SAINT from .torch_based.nn_models import DenseLightModel from .torch_based.nn_models import DenseModel from .torch_based.nn_models import LinearLayer @@ -84,6 +86,8 @@ logger = 
logging.getLogger(__name__) +models_dependent_on_training_data = ["saint"] + model_by_name = { "denselight": DenseLightModel, "dense": DenseModel, @@ -96,7 +100,9 @@ "autoint": AutoInt, "tabnet": TabNet, "fttransformer": FTTransformer, + "saint":SAINT, } + input_type_by_name = { "denselight": "flat", "dense": "flat", @@ -109,6 +115,7 @@ "autoint": "seq", "tabnet": "flat", "fttransformer": "seq", + "saint": "seq", } cat_embedder_by_name_flat = { "cat": CatEmbedder, @@ -255,7 +262,7 @@ class TorchModel(TabularMLAlgo): **_default_models_params, } - def _infer_params(self): + def _infer_params(self, train = None): if self.params["path_to_save"] is not None: self.path_to_save = os.path.relpath(self.params["path_to_save"]) if not os.path.exists(self.path_to_save): @@ -304,6 +311,22 @@ def _infer_params(self): params[p_name] = getattr(module, params[p_name]) # params = self._select_params(params) + if params['model'] in models_dependent_on_training_data: + self.use_sampler = True + if train is not None: + self.train = train + else: + self.use_sampler = False + + self.train_params = { + "dataset": params["dataset"], + "bs": params["bs"], + "num_workers": params["num_workers"], + "pin_memory": params["pin_memory"], + "tokenizer": AutoTokenizer.from_pretrained(params["bert_name"], use_fast=False) if is_text else None, + "max_length": params["max_length"], + } + model = Trainer( net=TorchUniversalModel if not params["model_with_emb"] else params["model"], net_params={ @@ -349,18 +372,11 @@ def _infer_params(self): "torch_model": torch_model, **params, }, - **{"apex": False, **params}, + + **{"apex": False, + **params}, ) - self.train_params = { - "dataset": params["dataset"], - "bs": params["bs"], - "num_workers": params["num_workers"], - "pin_memory": params["pin_memory"], - "tokenizer": AutoTokenizer.from_pretrained(params["bert_name"], use_fast=False) if is_text else None, - "max_length": params["max_length"], - } - return model @staticmethod @@ -553,8 +569,8 @@ def 
fit_predict(self, train_valid_iterator: TrainValidIterator) -> NumpyDataset: self.params = self.init_params_on_input(train_valid_iterator) self.params = self._init_params_on_input(train_valid_iterator) return super().fit_predict(train_valid_iterator) - - def fit_predict_single_fold(self, train, valid): + + def fit_predict_single_fold(self, train: TabularDataset, valid: TabularDataset): """Implements training and prediction on single fold. Args: @@ -570,14 +586,17 @@ def fit_predict_single_fold(self, train, valid): target = train.target self.params["bias"] = self.get_mean_target(target, task_name) if self.params["init_bias"] else None - model = self._infer_params() + model = self._infer_params(train) model_path = ( os.path.join(self.path_to_save, f"{uuid.uuid4()}.pickle") if self.path_to_save is not None else None ) # init datasets - dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas()}) - + if self.use_sampler: + dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas(),"sampler": train.to_pandas()}) + else: + dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas()}) + dataloaders['sampler'] = None val_pred = model.fit(dataloaders) if model_path is None: @@ -603,12 +622,17 @@ def predict_single_fold(self, model: any, dataset: TabularDataset) -> np.ndarray """ seed_everything(self.params["random_state"], self.params["deterministic"]) - dataloaders = self.get_dataloaders_from_dicts({"test": dataset.to_pandas()}) + if self.use_sampler: + dataloaders = self.get_dataloaders_from_dicts({"test": dataset.to_pandas(),"sampler": self.train.to_pandas()}) + else: + dataloaders = self.get_dataloaders_from_dicts({"test": dataset.to_pandas()}) + dataloaders['sampler'] = None + if isinstance(model, (str, dict)): model = self._infer_params().load_state(model) - pred = model.predict(dataloaders["test"], "test") + pred = model.predict(dataloaders, 
"test") model.clean() del dataloaders, model diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index d1834506..f2368853 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -8,6 +8,8 @@ import numpy as np import torch import torch.nn as nn + +from .saint.saint import ColTransformer, RowColTransformer from ..tabnet.utils import TabNetEncoder, _initialize_non_glu from .autoint.autoint_utils import AttnInteractionBlock, LeakyGate from .autoint.ghost_norm import GhostBatchNorm @@ -1187,3 +1189,91 @@ def forward(self, x): def forward_masks(self, x): """Magic forward-pass of encoder that returns masks.""" return self.encoder.forward_masks(x) + + +class SAINT(nn.Module): + def __init__( + self, + n_in: int, + n_out: int = 1, + embedding_size: int = 10, + depth: int =2, + heads: int = 8, + dim_head = 16, + mlp_hidden_mults = (4, 2), + ffn_mult = 4, + attn_dropout = 0., + ff_dropout = 0., + mlp_dropout =0., + attentiontype = 'colrow', + device: torch.device = torch.device("cuda:0"), + **kwargs + ): + super().__init__() + self.device = device + self.cls_token = nn.Embedding(2, embedding_size) + self.attentiontype = attentiontype + if attentiontype == 'col': + self.transformer = ColTransformer( + dim = embedding_size, + depth = depth, + heads = heads, + dim_head = dim_head, + attn_dropout = attn_dropout, + ff_dropout = ff_dropout + ) + elif attentiontype in ['row','colrow'] : + self.transformer = RowColTransformer( + dim = embedding_size, + nfeats= n_in+1, #num featurs + depth = depth, + heads = heads, + dim_head = dim_head, + ffn_mult = ffn_mult, + attn_dropout = attn_dropout, + ff_dropout = ff_dropout, + style = attentiontype + ) + + l = (n_in+1) // 8 #input_size = (dim * self.num_categories) + (dim * num_continuous) + hidden_dimensions = list(map(lambda t: l * t, mlp_hidden_mults)) + + self.mlp = MLP(n_in = embedding_size, + n_out = n_out, + hidden_size = 
hidden_dimensions, + drop_rate=mlp_dropout, + use_bn = False, + dropout_first= False) + # self.embeds = nn.Embedding(self.total_tokens, self.dim) #.to(device) + + + + def forward(self, embedded: torch.Tensor, bs: int) -> torch.Tensor: + """Transform the input tensor. + + Args: + embedded : torch.Tensor + embedded fields + + Returns: + torch.Tensor + + """ + mask = torch.zeros((len(embedded),len(embedded)), device=self.device, dtype=torch.bool) + mask[torch.arange(bs), torch.arange(bs)] = 1 + mask[:bs, bs:] = 1 + mask[bs:, bs:] = 1 + + cls_token = torch.unsqueeze( + self.cls_token(torch.ones(embedded.shape[0], dtype=torch.int).to(self.device)), dim=1 + ) + x = torch.cat((cls_token, embedded), dim=1) + x = self.transformer(x, mask_samples=mask) + + # NOTE modified to simple X -> Y supervised model + + # cat_outs = self.mlp1(x[:,:self.num_categories,:]) + # con_outs = self.mlp2(x[:,self.num_categories:,:]) + # return cat_outs, con_outs + + return self.mlp(x[:,0,:]) diff --git a/lightautoml/ml_algo/torch_based/saint/saint.py b/lightautoml/ml_algo/torch_based/saint/saint.py new file mode 100644 index 00000000..03761aae --- /dev/null +++ b/lightautoml/ml_algo/torch_based/saint/saint.py @@ -0,0 +1,144 @@ + +import numpy as np +import torch +import torch.nn.functional as F +from einops import rearrange +from torch import einsum, nn + +def exists(val): + return val is not None + +def default(val, d): + return val if exists(val) else d + +def ff_encodings(x,B): + x_proj = (2. 
* np.pi * x.unsqueeze(-1)) @ B.t() + return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1) + + +class Residual(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x, **kwargs): + return self.fn(x, **kwargs) + x + +class PreNorm(nn.Module): + def __init__(self, dim, fn): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.fn = fn + + def forward(self, x, **kwargs): + return self.fn(self.norm(x), **kwargs) + +# attention + +class GEGLU(nn.Module): + def forward(self, x): + x, gates = x.chunk(2, dim = -1) + return x * F.gelu(gates) + +class FeedForward(nn.Module): + def __init__(self, dim, mult = 4, dropout = 0.): + super().__init__() + self.net = nn.Sequential( + nn.Linear(dim, int(dim * mult) * 2), + GEGLU(), + nn.Dropout(dropout), + nn.Linear(int(dim * mult), dim) + ) + + def forward(self, x, **kwargs): + return self.net(x, **kwargs) + +class Attention(nn.Module): + def __init__( + self, + dim, + heads = 8, + dim_head = 16, + dropout = 0. 
+ ): + super().__init__() + inner_dim = dim_head * heads + self.heads = heads + self.scale = dim_head ** -0.5 + + self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False) + self.to_out = nn.Linear(inner_dim, dim) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x, mask=None): + h = self.heads + q, k, v = self.to_qkv(x).chunk(3, dim = -1) + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v)) + sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale + if mask is not None: + sim[~mask[None, None].expand_as(sim)] = float('-inf') + attn = sim.softmax(dim = -1) + out = einsum('b h i j, b h j d -> b h i d', attn, v) + out = rearrange(out, 'b h n d -> b n (h d)', h = h) + return self.to_out(out) + + +class RowColTransformer(nn.Module): + def __init__(self, dim, nfeats, depth, heads, dim_head, ffn_mult, attn_dropout, ff_dropout, style='col'): + super().__init__() + self.layers = nn.ModuleList([]) + self.mask_embed = nn.Embedding(nfeats, dim) + self.style = style + for _ in range(depth): + if self.style == 'colrow': + self.layers.append(nn.ModuleList([ + PreNorm(dim, Residual(Attention(dim, heads = heads, dim_head = dim_head, dropout = attn_dropout))), + PreNorm(dim, Residual(FeedForward(dim, mult=ffn_mult, dropout = ff_dropout))), + PreNorm(dim*nfeats, Residual(Attention(dim*nfeats, heads = heads, dim_head = dim_head, dropout = attn_dropout))), + PreNorm(dim*nfeats, Residual(FeedForward(dim*nfeats, mult=ffn_mult, dropout = ff_dropout))), + ])) + else: + self.layers.append(nn.ModuleList([ + PreNorm(dim*nfeats, Residual(Attention(dim*nfeats, heads = heads, dim_head = 64, dropout = attn_dropout))), + PreNorm(dim*nfeats, Residual(FeedForward(dim*nfeats, mult=ffn_mult, dropout = ff_dropout))), + ])) + + def forward(self, x, mask_features=None, mask_samples=None): + + _, n, _ = x.shape + if self.style == 'colrow': + for attn1, ff1, attn2, ff2 in self.layers: # type: ignore[code] + x = attn1(x, mask=mask_features) + x = ff1(x) + x = 
rearrange(x, 'b n d -> 1 b (n d)') + x = attn2(x, mask=mask_samples) + x = ff2(x) + x = rearrange(x, '1 b (n d) -> b n d', n = n) + else: + for attn1, ff1 in self.layers: # type: ignore[code] + x = rearrange(x, 'b n d -> 1 b (n d)') + x = attn1(x) + x = ff1(x) + x = rearrange(x, '1 b (n d) -> b n d', n = n) + return x + + +# transformer +class ColTransformer(nn.Module): + def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout): + super().__init__() + self.layers = nn.ModuleList([]) + + + for _ in range(depth): + self.layers.append(nn.ModuleList([ + PreNorm(dim, Residual(Attention(dim, heads = heads, dim_head = dim_head, dropout = attn_dropout))), + PreNorm(dim, Residual(FeedForward(dim, dropout = ff_dropout))), + ])) + + def forward(self, x, mask_features=None, mask_samples=None): + for attn, ff in self.layers: + x = attn(x) + x = ff(x) + return x diff --git a/lightautoml/text/nn_model.py b/lightautoml/text/nn_model.py index 3ad54bb9..7508aead 100644 --- a/lightautoml/text/nn_model.py +++ b/lightautoml/text/nn_model.py @@ -228,8 +228,10 @@ def get_logits(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: output = torch.cat(outputs, dim=1) else: output = outputs[0] - - logits = self.torch_model(output) + if 'batch_size' in inp.keys(): + logits = self.torch_model(output,inp['batch_size']) + else: + logits = self.torch_model(output) return logits def get_preds_from_logits(self, logits: torch.Tensor) -> torch.Tensor: diff --git a/lightautoml/text/trainer.py b/lightautoml/text/trainer.py index 240af7e0..79f60438 100644 --- a/lightautoml/text/trainer.py +++ b/lightautoml/text/trainer.py @@ -3,7 +3,7 @@ import logging from copy import deepcopy -from typing import Any +from typing import Any, Iterable from typing import Callable from typing import Dict from typing import List @@ -290,6 +290,7 @@ def __init__( stop_by_metric: bool = False, clip_grad: bool = False, clip_grad_params: Optional[Dict] = None, + **kwargs ): self.net = net @@ -312,7 +313,7 @@ 
def __init__( self.stop_by_metric = stop_by_metric self.clip_grad = clip_grad self.clip_grad_params = clip_grad_params if clip_grad_params is not None else {} - + self.dataloader = None self.model = None self.optimizer = None @@ -433,10 +434,16 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: for epoch in range(self.n_epochs): self.epoch = epoch # train - train_loss = self.train(dataloaders=dataloaders) + if dataloaders['sampler'] is not None: + train_loss = self.train_with_sampler(dataloaders=dataloaders) + else: + train_loss = self.train(dataloaders=dataloaders) train_log.extend(train_loss) # test - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + if dataloaders['sampler'] is not None: + val_loss, val_data, weights = self.test_with_sampler(dataloader=dataloaders["val"], sampler = dataloaders["sampler"] ) + else: + val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) if self.stop_by_metric: cond = -1 * self.metric(*val_data, weights) else: @@ -461,14 +468,20 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: self.se.set_best_params(self.model) if self.is_snap: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"], snap=True, stage="val") + if dataloaders['sampler'] is not None: + val_loss, val_data, weights = self.test_with_sampler(dataloader=dataloaders["val"],sampler=dataloaders["sampler"], snap=True, stage="val") + else: + val_loss, val_data, weights = self.test(dataloader=dataloaders["val"], snap=True, stage="val") logger.info3( "Result SE, val loss: {vl}, val metric: {me}".format( me=self.metric(*val_data, weights), vl=np.mean(val_loss) ) ) elif self.se.swa: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + if dataloaders['sampler'] is not None: + val_loss, val_data, weights = self.test_with_sampler(dataloader=dataloaders["val"], sampler=dataloaders["sampler"]) + else: + val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) 
logger.info3( "Early stopping: val loss: {vl}, val metric: {me}".format( me=self.metric(*val_data, weights), vl=np.mean(val_loss) @@ -479,6 +492,75 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: return val_data[1] + + def train_with_sampler(self, dataloaders: Dict[str, DataLoader]) -> List[float]: + """Training loop. + + Args: + dataloaders: Dict with torch dataloaders. + + Returns: + Loss. + + """ + ################## + loss_log = [] + self.model.train() + running_loss = 0 + c = 0 + + logging_level = get_stdout_level() + if logging_level < logging.INFO and self.verbose and self.verbose_bar: + loader = tqdm(zip(dataloaders["train"],dataloaders['sampler']), desc="train", disable=False) + else: + loader = zip(dataloaders["train"],dataloaders['sampler']) + for sample, candidate_sample in loader: + data = { + i: torch.cat([(sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)), + (candidate_sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else candidate_sample[i].to(self.device))]) + for i in sample.keys() + } + ### NOTE, HERE WE CAN ADD TORCH.UNIQUE + data['batch_size'] = len(sample['label']) + + loss = self.model(data).mean() + if self.apex: + with self.amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + if self.clip_grad: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), **self.clip_grad_params) + self.optimizer.step() + self.optimizer.zero_grad() + + loss = loss.data.cpu().numpy() + loss_log.append(loss) + running_loss += loss + + c += 1 + if self.verbose and self.verbose_bar and logging_level < logging.INFO: + if self.verbose_inside and c % self.verbose_inside == 0: + val_loss, val_data, weights = self.test_with_sampler(dataloader=dataloaders["val"],sampler=dataloaders['sampler']) + if self.stop_by_metric: + cond = -1 * self.metric(*val_data, weights) + else: + cond = np.mean(val_loss) + self.se.update(self.model, cond) + 
+ logger.info3( + "Epoch: {e}, iter: {c}, val loss: {vl}, val metric: {me}".format( + me=self.metric(*val_data, weights), + e=self.epoch, + c=c, + vl=np.mean(val_loss), + ) + ) + loader.set_description("train (loss=%g)" % (running_loss / c)) + + return loss_log + def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: """Training loop. @@ -489,6 +571,7 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: Loss. """ + ################## loss_log = [] self.model.train() running_loss = 0 @@ -558,6 +641,7 @@ def test( Loss, (Target, OOF). """ + ##################### loss_log = [] weights_log = [] self.model.eval() @@ -609,7 +693,75 @@ def test( np.array(weights_log), ) - def predict(self, dataloader: DataLoader, stage: str) -> np.ndarray: + def test_with_sampler( + self, dataloader: DataLoader, sampler: DataLoader,stage: str = "val", snap: bool = False + ) -> Tuple[List[float], Tuple[np.ndarray, np.ndarray]]: + """Testing loop. + + Args: + dataloader: Torch dataloader. + stage: Train, val or test. + snap: Use snapshots. + + Returns: + Loss, (Target, OOF). 
+ + """ + ##################### + loss_log = [] + weights_log = [] + self.model.eval() + pred = [] + target = [] + logging_level = get_stdout_level() + if logging_level < logging.INFO and self.verbose and self.verbose_bar: + loader = tqdm(zip(dataloader,sampler), desc=stage, disable=False) + else: + loader = zip(dataloader,sampler) + + with torch.no_grad(): + for sample, candidate_sample in loader: + data = { + i: torch.cat([(sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)), + (candidate_sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else candidate_sample[i].to(self.device))]) + for i in sample.keys() + } + ### NOTE, HERE WE CAN ADD TORCH.UNIQUE + data['batch_size'] = len(sample['label']) + + if snap: + output = self.se.predict(data) + loss = self.se.forward(data) if stage != "test" else None + else: + output = self.model.predict(data) + loss = self.model(data) if stage != "test" else None + + if stage != "test": + loss = loss.mean().data.cpu().numpy() + + loss_log.append(loss) + + output = output.data.cpu().numpy()[:len(sample['label'])] + target_data = data["label"].data.cpu().numpy()[:len(sample['label'])] + weights = data.get("weight", None) + if weights is not None: + weights = weights.data.cpu().numpy()[:len(sample['label'])] + + pred.append(output) + target.append(target_data) + weights_log.extend(weights) + + self.model.train() + + return ( + loss_log, + ( + np.vstack(target) if len(target[0].shape) == 2 else np.hstack(target), + np.vstack(pred) if len(pred[0].shape) == 2 else np.hstack(pred), + ), + np.array(weights_log), + ) + def predict(self, dataloaders: DataLoader, stage: str) -> np.ndarray: """Predict model. Args: @@ -620,5 +772,8 @@ def predict(self, dataloader: DataLoader, stage: str) -> np.ndarray: Prediction. 
""" - loss, (target, pred), _ = self.test(stage=stage, snap=self.is_snap, dataloader=dataloader) + if dataloaders['sampler'] is not None: + loss, (target, pred), _ = self.test_with_sampler(stage=stage, snap=self.is_snap, dataloader=dataloaders[stage],sampler=dataloaders['sampler']) + else: + loss, (target, pred), _ = self.test(stage=stage, snap=self.is_snap, dataloader=dataloaders[stage]) return pred diff --git a/lightautoml/text/utils.py b/lightautoml/text/utils.py index 18a8fe70..fe91f806 100644 --- a/lightautoml/text/utils.py +++ b/lightautoml/text/utils.py @@ -66,7 +66,7 @@ def is_shuffle(stage: str) -> bool: Bool value. """ - is_sh = {"train": True, "val": False, "test": False} + is_sh = {"train": True, "val": False, "test": False, "sampler": True} return is_sh[stage] From 799ded248d417d67426c438eb1f9544c1d577f6e Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 2 Oct 2023 10:21:02 +0000 Subject: [PATCH 42/49] no-verify --- lightautoml/ml_algo/dl_model.py | 4 ++++ lightautoml/text/embed.py | 13 ++++++++++++- lightautoml/text/trainer.py | 5 +++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 38b1f521..bde69ff2 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -52,6 +52,8 @@ DenseEmbeddingFlat, LinearEmbedding, LinearEmbeddingFlat, + MLPContEmbedding, + MLPContEmbeddingFlat, PLREmbedding, PLREmbeddingFlat, SoftEmbedding, @@ -134,6 +136,7 @@ "dense": DenseEmbeddingFlat, "plr": PLREmbeddingFlat, "soft": SoftEmbeddingFlat, + "mlp": MLPContEmbeddingFlat } cont_embedder_by_name = { "cont": LinearEmbedding, @@ -141,6 +144,7 @@ "dense": DenseEmbedding, "plr": PLREmbedding, "soft": SoftEmbedding, + "mlp": MLPContEmbedding, } diff --git a/lightautoml/text/embed.py b/lightautoml/text/embed.py index fa0ea834..b4974d3d 100644 --- a/lightautoml/text/embed.py +++ b/lightautoml/text/embed.py @@ -759,7 +759,18 @@ def forward(self, X: Dict) -> 
Tensor: """ x = X["cont"] - x = torch.stack([l(x[:, i]) for i, l in enumerate(self.layers)], 1) + # ans = [] + # for i, l in enumerate(self.layers): + # temp = x[:,i].view(x.size(0),-1) + # temp = l(temp) + # x = torch.stack(ans,1) + x = torch.stack([l(x[:, i].view(-1,1)) for i, l in enumerate(self.layers)], 1) if self.flatten_output: return x.view(x.shape[0], -1) return x + +class MLPContEmbeddingFlat(MLPContEmbedding): + """Flatten version of BasicCatEmbedding.""" + + def __init__(self, *args, **kwargs): + super(MLPContEmbeddingFlat, self).__init__(*args, **{**kwargs, **{"flatten_output": True}}) \ No newline at end of file diff --git a/lightautoml/text/trainer.py b/lightautoml/text/trainer.py index 79f60438..1825a54a 100644 --- a/lightautoml/text/trainer.py +++ b/lightautoml/text/trainer.py @@ -1,5 +1,6 @@ """Main pytorch training and prediction class with Snapshots Ensemble.""" +from itertools import cycle import logging from copy import deepcopy @@ -715,9 +716,9 @@ def test_with_sampler( target = [] logging_level = get_stdout_level() if logging_level < logging.INFO and self.verbose and self.verbose_bar: - loader = tqdm(zip(dataloader,sampler), desc=stage, disable=False) + loader = tqdm(zip(dataloader,cycle(sampler)), desc=stage, disable=False) else: - loader = zip(dataloader,sampler) + loader = zip(dataloader,cycle(sampler)) with torch.no_grad(): for sample, candidate_sample in loader: From 57aa2a5e0c1bb0ddd7d1bbf886c74e0ce26cfaef Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 2 Oct 2023 10:24:14 +0000 Subject: [PATCH 43/49] no-verify --- lightautoml/ml_algo/dl_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index bde69ff2..483bcf84 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -1,7 +1,6 @@ """Neural net for tabular datasets.""" -from itertools import cycle from lightautoml.utils.installation import __validate_extra_deps From 
ef7316b8e3ca76e0cc758b1bbd5779f8559506c7 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Wed, 4 Oct 2023 09:33:04 +0000 Subject: [PATCH 44/49] starting changing --- lightautoml/ml_algo/torch_based/nn_models.py | 2 +- .../ml_algo/{ => torch_based}/tabnet/utils.py | 0 lightautoml/text/trainer.py | 293 +++++++++--------- 3 files changed, 146 insertions(+), 149 deletions(-) rename lightautoml/ml_algo/{ => torch_based}/tabnet/utils.py (100%) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index f2368853..de698791 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -10,7 +10,7 @@ import torch.nn as nn from .saint.saint import ColTransformer, RowColTransformer -from ..tabnet.utils import TabNetEncoder, _initialize_non_glu +from .tabnet.utils import TabNetEncoder, _initialize_non_glu from .autoint.autoint_utils import AttnInteractionBlock, LeakyGate from .autoint.ghost_norm import GhostBatchNorm from .fttransformer.fttransformer_utils import Transformer diff --git a/lightautoml/ml_algo/tabnet/utils.py b/lightautoml/ml_algo/torch_based/tabnet/utils.py similarity index 100% rename from lightautoml/ml_algo/tabnet/utils.py rename to lightautoml/ml_algo/torch_based/tabnet/utils.py diff --git a/lightautoml/text/trainer.py b/lightautoml/text/trainer.py index 1825a54a..a68d13cc 100644 --- a/lightautoml/text/trainer.py +++ b/lightautoml/text/trainer.py @@ -494,74 +494,6 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: return val_data[1] - def train_with_sampler(self, dataloaders: Dict[str, DataLoader]) -> List[float]: - """Training loop. - - Args: - dataloaders: Dict with torch dataloaders. - - Returns: - Loss. 
- - """ - ################## - loss_log = [] - self.model.train() - running_loss = 0 - c = 0 - - logging_level = get_stdout_level() - if logging_level < logging.INFO and self.verbose and self.verbose_bar: - loader = tqdm(zip(dataloaders["train"],dataloaders['sampler']), desc="train", disable=False) - else: - loader = zip(dataloaders["train"],dataloaders['sampler']) - for sample, candidate_sample in loader: - data = { - i: torch.cat([(sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)), - (candidate_sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else candidate_sample[i].to(self.device))]) - for i in sample.keys() - } - ### NOTE, HERE WE CAN ADD TORCH.UNIQUE - data['batch_size'] = len(sample['label']) - - loss = self.model(data).mean() - if self.apex: - with self.amp.scale_loss(loss, self.optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - if self.clip_grad: - torch.nn.utils.clip_grad_norm_(self.model.parameters(), **self.clip_grad_params) - self.optimizer.step() - self.optimizer.zero_grad() - - loss = loss.data.cpu().numpy() - loss_log.append(loss) - running_loss += loss - - c += 1 - if self.verbose and self.verbose_bar and logging_level < logging.INFO: - if self.verbose_inside and c % self.verbose_inside == 0: - val_loss, val_data, weights = self.test_with_sampler(dataloader=dataloaders["val"],sampler=dataloaders['sampler']) - if self.stop_by_metric: - cond = -1 * self.metric(*val_data, weights) - else: - cond = np.mean(val_loss) - self.se.update(self.model, cond) - - logger.info3( - "Epoch: {e}, iter: {c}, val loss: {vl}, val metric: {me}".format( - me=self.metric(*val_data, weights), - e=self.epoch, - c=c, - vl=np.mean(val_loss), - ) - ) - loader.set_description("train (loss=%g)" % (running_loss / c)) - - return loss_log - def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: """Training loop. 
@@ -583,16 +515,18 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: loader = tqdm(dataloaders["train"], desc="train", disable=False) else: loader = dataloaders["train"] - for sample in loader: data = { i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) for i in sample.keys() } + data['batch_size'] = len(sample['label']) + if dataloaders['sampler'] is not None: + data['sampler'] = dataloaders['sampler'] loss = self.model(data).mean() if self.apex: - with self.amp.scale_loss(loss, self.optimizer) as scaled_loss: + with self.amp.scale_loss(loss, self .optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() @@ -609,7 +543,7 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: c += 1 if self.verbose and self.verbose_bar and logging_level < logging.INFO: if self.verbose_inside and c % self.verbose_inside == 0: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + val_loss, val_data, weights = self.test(dataloader=dataloaders) if self.stop_by_metric: cond = -1 * self.metric(*val_data, weights) else: @@ -628,74 +562,140 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: return loss_log - def test( - self, dataloader: DataLoader, stage: str = "val", snap: bool = False - ) -> Tuple[List[float], Tuple[np.ndarray, np.ndarray]]: - """Testing loop. - - Args: - dataloader: Torch dataloader. - stage: Train, val or test. - snap: Use snapshots. - - Returns: - Loss, (Target, OOF). 
- - """ - ##################### - loss_log = [] - weights_log = [] - self.model.eval() - pred = [] - target = [] - logging_level = get_stdout_level() - if logging_level < logging.INFO and self.verbose and self.verbose_bar: - loader = tqdm(dataloader, desc=stage, disable=False) - else: - loader = dataloader - - with torch.no_grad(): - for sample in loader: - data = { - i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) - for i in sample.keys() - } - - if snap: - output = self.se.predict(data) - loss = self.se.forward(data) if stage != "test" else None - else: - output = self.model.predict(data) - loss = self.model(data) if stage != "test" else None - - if stage != "test": - loss = loss.mean().data.cpu().numpy() - - loss_log.append(loss) - - output = output.data.cpu().numpy() - target_data = data["label"].data.cpu().numpy() - weights = data.get("weight", None) - if weights is not None: - weights = weights.data.cpu().numpy() - - pred.append(output) - target.append(target_data) - weights_log.extend(weights) - - self.model.train() + # def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: + # """Training loop. + + # Args: + # dataloaders: Dict with torch dataloaders. + + # Returns: + # Loss. 
+ + # """ + # ################## + # loss_log = [] + # self.model.train() + # running_loss = 0 + # c = 0 + + # logging_level = get_stdout_level() + # if logging_level < logging.INFO and self.verbose and self.verbose_bar: + # loader = tqdm(dataloaders["train"], desc="train", disable=False) + # else: + # loader = dataloaders["train"] + + # for sample in loader: + # data = { + # i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) + # for i in sample.keys() + # } + + # loss = self.model(data).mean() + # if self.apex: + # with self.amp.scale_loss(loss, self.optimizer) as scaled_loss: + # scaled_loss.backward() + # else: + # loss.backward() + + # if self.clip_grad: + # torch.nn.utils.clip_grad_norm_(self.model.parameters(), **self.clip_grad_params) + # self.optimizer.step() + # self.optimizer.zero_grad() + + # loss = loss.data.cpu().numpy() + # loss_log.append(loss) + # running_loss += loss + + # c += 1 + # if self.verbose and self.verbose_bar and logging_level < logging.INFO: + # if self.verbose_inside and c % self.verbose_inside == 0: + # val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + # if self.stop_by_metric: + # cond = -1 * self.metric(*val_data, weights) + # else: + # cond = np.mean(val_loss) + # self.se.update(self.model, cond) + + # logger.info3( + # "Epoch: {e}, iter: {c}, val loss: {vl}, val metric: {me}".format( + # me=self.metric(*val_data, weights), + # e=self.epoch, + # c=c, + # vl=np.mean(val_loss), + # ) + # ) + # loader.set_description("train (loss=%g)" % (running_loss / c)) + + # return loss_log + + # def test( + # self, dataloader: DataLoader, stage: str = "val", snap: bool = False + # ) -> Tuple[List[float], Tuple[np.ndarray, np.ndarray]]: + # """Testing loop. + + # Args: + # dataloader: Torch dataloader. + # stage: Train, val or test. + # snap: Use snapshots. + + # Returns: + # Loss, (Target, OOF). 
+ + # """ + # ##################### + # loss_log = [] + # weights_log = [] + # self.model.eval() + # pred = [] + # target = [] + # logging_level = get_stdout_level() + # if logging_level < logging.INFO and self.verbose and self.verbose_bar: + # loader = tqdm(dataloader, desc=stage, disable=False) + # else: + # loader = dataloader + + # with torch.no_grad(): + # for sample in loader: + # data = { + # i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) + # for i in sample.keys() + # } + + # if snap: + # output = self.se.predict(data) + # loss = self.se.forward(data) if stage != "test" else None + # else: + # output = self.model.predict(data) + # loss = self.model(data) if stage != "test" else None + + # if stage != "test": + # loss = loss.mean().data.cpu().numpy() + + # loss_log.append(loss) + + # output = output.data.cpu().numpy() + # target_data = data["label"].data.cpu().numpy() + # weights = data.get("weight", None) + # if weights is not None: + # weights = weights.data.cpu().numpy() + + # pred.append(output) + # target.append(target_data) + # weights_log.extend(weights) + + # self.model.train() + + # return ( + # loss_log, + # ( + # np.vstack(target) if len(target[0].shape) == 2 else np.hstack(target), + # np.vstack(pred) if len(pred[0].shape) == 2 else np.hstack(pred), + # ), + # np.array(weights_log), + # ) - return ( - loss_log, - ( - np.vstack(target) if len(target[0].shape) == 2 else np.hstack(target), - np.vstack(pred) if len(pred[0].shape) == 2 else np.hstack(pred), - ), - np.array(weights_log), - ) - - def test_with_sampler( - self, dataloader: DataLoader, sampler: DataLoader,stage: str = "val", snap: bool = False + def test( + self, dataloaders: DataLoader,stage: str = "val", snap: bool = False ) -> Tuple[List[float], Tuple[np.ndarray, np.ndarray]]: """Testing loop. 
@@ -716,20 +716,20 @@ def test_with_sampler( target = [] logging_level = get_stdout_level() if logging_level < logging.INFO and self.verbose and self.verbose_bar: - loader = tqdm(zip(dataloader,cycle(sampler)), desc=stage, disable=False) + loader = tqdm(dataloaders[stage], desc=stage, disable=False) else: - loader = zip(dataloader,cycle(sampler)) + loader = dataloaders[stage] with torch.no_grad(): for sample, candidate_sample in loader: data = { - i: torch.cat([(sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)), - (candidate_sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else candidate_sample[i].to(self.device))]) + i: sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device) for i in sample.keys() } ### NOTE, HERE WE CAN ADD TORCH.UNIQUE data['batch_size'] = len(sample['label']) - + if dataloaders['sampler'] is not None: + data['sampler'] = dataloaders['sampler'] if snap: output = self.se.predict(data) loss = self.se.forward(data) if stage != "test" else None @@ -745,7 +745,7 @@ def test_with_sampler( output = output.data.cpu().numpy()[:len(sample['label'])] target_data = data["label"].data.cpu().numpy()[:len(sample['label'])] weights = data.get("weight", None) - if weights is not None: + if weights is not None: weights = weights.data.cpu().numpy()[:len(sample['label'])] pred.append(output) @@ -773,8 +773,5 @@ def predict(self, dataloaders: DataLoader, stage: str) -> np.ndarray: Prediction. 
""" - if dataloaders['sampler'] is not None: - loss, (target, pred), _ = self.test_with_sampler(stage=stage, snap=self.is_snap, dataloader=dataloaders[stage],sampler=dataloaders['sampler']) - else: - loss, (target, pred), _ = self.test(stage=stage, snap=self.is_snap, dataloader=dataloaders[stage]) + loss, (target, pred), _ = self.test(stage=stage, snap=self.is_snap, dataloader=dataloaders) return pred From 50962ddbcaa8ed72d033700e590fe5652d1061b3 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Thu, 12 Oct 2023 15:09:28 +0000 Subject: [PATCH 45/49] more changes --- lightautoml/dataset/base.py | 48 ++++++++ lightautoml/dataset/np_pd_dataset.py | 36 ++++++ lightautoml/dataset/utils.py | 115 ++++++++++++++++++ lightautoml/ml_algo/base.py | 6 +- lightautoml/ml_algo/dl_model.py | 17 ++- .../ml_algo/torch_based/tabnet/utils.py | 4 +- lightautoml/text/nn_model.py | 40 ++++-- lightautoml/text/trainer.py | 63 ++++++---- lightautoml/text/utils.py | 1 + 9 files changed, 285 insertions(+), 45 deletions(-) diff --git a/lightautoml/dataset/base.py b/lightautoml/dataset/base.py index a033e7db..5a107f19 100644 --- a/lightautoml/dataset/base.py +++ b/lightautoml/dataset/base.py @@ -365,6 +365,19 @@ def shape(self) -> Tuple[Optional[int], Optional[int]]: return rows, cols # static methods - how to make 1d slice, 2s slice, concat of feature matrix etc ... + @staticmethod + def _vstack(datasets: Sequence[Any]) -> Any: + """Abstract method - define horizontal stack of feature arrays. + + Args: + datasets: Sequence of feature arrays. + + Returns: # noqa DAR202 + Single feature array. + + """ + raise NotImplementedError("Horizontal Stack not implemented.") + @staticmethod def _hstack(datasets: Sequence[Any]) -> Any: """Abstract method - define horizontal stack of feature arrays. 
@@ -472,7 +485,42 @@ def concat(cls, datasets: Sequence["LAMLDataset"]) -> "LAMLDataset": dataset.set_data(data, features, roles) return dataset + @classmethod + def vconcat(cls, datasets: Sequence["LAMLDataset"]) -> "LAMLDataset": + """Concat multiple dataset. + + Default behavior - takes empty dataset from datasets[0] + and concat all features from others. + + Args: + datasets: Sequence of datasets. + + Returns: + Concated dataset. + + """ + for check in cls._concat_checks: + check(datasets) + + dataset = datasets[0].empty() + data = [] + features = [*datasets[0].features] + roles = {**datasets[0].roles} + atrs = set(dataset._array_like_attrs) + for ds in datasets: + data.append(ds.data) + for atr in ds._array_like_attrs: + if atr not in atrs: + dataset._array_like_attrs.append(atr) + dataset.__dict__[atr] = ds.__dict__[atr] + atrs.update({atr}) + + data = cls._vstack(data) + dataset.set_data(data, features, roles) + + return dataset + def drop_features(self, droplist: Sequence[str]): """Inplace drop columns from dataset. diff --git a/lightautoml/dataset/np_pd_dataset.py b/lightautoml/dataset/np_pd_dataset.py index 3ec8789c..bffc37c4 100644 --- a/lightautoml/dataset/np_pd_dataset.py +++ b/lightautoml/dataset/np_pd_dataset.py @@ -212,6 +212,18 @@ def _hstack(datasets: Sequence[np.ndarray]) -> np.ndarray: """ return np.hstack(datasets) + @staticmethod + def _vstack(datasets: Sequence[np.ndarray]) -> np.ndarray: + """Concatenate function for numpy arrays. + + Args: + datasets: Sequence of np.ndarray. + + Returns: + Stacked features array. + + """ + return np.vstack(datasets) @staticmethod def _get_rows(data: np.ndarray, k: IntIdx) -> np.ndarray: @@ -400,6 +412,17 @@ def _hstack(datasets: Sequence[Union[sparse.csr_matrix, np.ndarray]]) -> sparse. """ return sparse.hstack(datasets, format="csr") + def _vstack(datasets: Sequence[Union[sparse.csr_matrix, np.ndarray]]) -> sparse.csr_matrix: + """Concatenate function for sparse and numpy arrays. 
+ + Args: + datasets: Sequence of csr_matrix or np.ndarray. + + Returns: + Sparse matrix. + + """ + return sparse.vstack(datasets, format="csr") def __init__( self, @@ -609,6 +632,19 @@ def _hstack(datasets: Sequence[DataFrame]) -> DataFrame: """ return pd.concat(datasets, axis=1) + + @staticmethod + def _vstack(datasets: Sequence[DataFrame]) -> DataFrame: + """Define how to concat features arrays. + + Args: + datasets: Sequence of tables. + + Returns: + concatenated table. + + """ + return pd.concat(datasets, axis=0) @staticmethod def _get_rows(data: DataFrame, k: IntIdx) -> FrameOrSeries: diff --git a/lightautoml/dataset/utils.py b/lightautoml/dataset/utils.py index 5f3410e5..158e9fa0 100644 --- a/lightautoml/dataset/utils.py +++ b/lightautoml/dataset/utils.py @@ -158,3 +158,118 @@ def concatenate(datasets: Sequence[LAMLDataset]) -> LAMLDataset: datasets = [datasets[n]] + [x for (y, x) in enumerate(datasets) if n != y] return conc(datasets) + + + +def get_common_vconcat( + datasets: Sequence[LAMLDataset], +) -> Tuple[Callable, Optional[type]]: + """Get concatenation function for datasets of different types. + + Takes multiple datasets as input and check, + if is's ok to concatenate it and return function. + + Args: + datasets: Sequence of datasets. + + Returns: + Function, that is able to concatenate datasets. + + """ + # TODO: Add pandas + numpy via transforming to numpy? 
+ dataset_types = set([type(x) for x in datasets]) + + # general - if single type, concatenation for that type + if len(dataset_types) == 1: + klass = list(dataset_types)[0] + return klass.vconcat, None + + # np and sparse goes to sparse + elif dataset_types == {NumpyDataset, CSRSparseDataset}: + return CSRSparseDataset.vconcat, CSRSparseDataset + + elif dataset_types == {NumpyDataset, PandasDataset}: + return numpy_and_pandas_vconcat, None + + elif (dataset_types == {NumpyDataset, SeqNumpyPandasDataset}) or ( + dataset_types == {PandasDataset, SeqNumpyPandasDataset} + ): + return numpy_or_pandas_and_seq_vconcat, None + + raise TypeError("Unable to concatenate dataset types {0}".format(list(dataset_types))) + + +def numpy_and_pandas_vconcat(datasets: Sequence[Union[NumpyDataset, PandasDataset]]) -> PandasDataset: + """Concat of numpy and pandas dataset. + + Args: + datasets: Sequence of datasets to concatenate. + + Returns: + Concatenated dataset. + + """ + datasets = [x.to_pandas() for x in datasets] + + return PandasDataset.vconcat(datasets) + + +def numpy_or_pandas_and_seq_vconcat( + datasets: Sequence[Union[NumpyDataset, PandasDataset, SeqNumpyPandasDataset]] +) -> Union[NumpyDataset, PandasDataset]: + """Concat plain and sequential dataset. + + If both datasets have same size then concat them as plain, otherwise include seq dataset inside plain one. + + Args: + datasets: one plain and one seq dataset. + + Returns: + Concatenated dataset. 
+ + """ + assert len(datasets) == 2, "should be 1 sequential and 1 plain dataset" + # get 1 numpy / pandas dataset + for n, dataset in enumerate(datasets): + if type(dataset) == SeqNumpyPandasDataset: + seq_dataset = dataset + else: + plain_dataset = dataset + + if len(seq_dataset.data) == len(plain_dataset): + return SeqNumpyPandasDataset.vconcat([seq_dataset, plain_dataset.to_pandas()]) + else: + if hasattr(plain_dataset, "seq_data"): + plain_dataset.seq_data[seq_dataset.name] = seq_dataset + else: + plain_dataset.seq_data = {seq_dataset.name: seq_dataset} + + return plain_dataset + + +def vconcatenate(datasets: Sequence[LAMLDataset]) -> LAMLDataset: + """Dataset concatenation function. + + Check if datasets have common concat function and then apply. + Assume to take target/folds/weights etc from first one. + + Args: + datasets: Sequence of datasets. + + Returns: + Dataset with concatenated features. + + """ + conc, klass = get_common_vconcat([ds for ds in datasets if ds is not None]) + + # this part is made to avoid setting first dataset of required type + if klass is not None: + + n = 0 + for n, ds in enumerate(datasets): + if type(ds) is klass: + break + + datasets = [datasets[n]] + [x for (y, x) in enumerate(datasets) if n != y] + + return conc(datasets) \ No newline at end of file diff --git a/lightautoml/ml_algo/base.py b/lightautoml/ml_algo/base.py index 0dec5aba..74c3c6da 100755 --- a/lightautoml/ml_algo/base.py +++ b/lightautoml/ml_algo/base.py @@ -16,7 +16,7 @@ import numpy as np -from lightautoml.validation.base import TrainValidIterator +from lightautoml.validation.base import HoldoutIterator, TrainValidIterator from ..dataset.base import LAMLDataset from ..dataset.np_pd_dataset import CSRSparseDataset @@ -271,8 +271,8 @@ def fit_predict(self, train_valid_iterator: TrainValidIterator) -> NumpyDataset: "===== Start working with \x1b[1mfold {}\x1b[0m for \x1b[1m{}\x1b[0m =====".format(n, self._name) ) self.timer.set_control_point() - - model, pred = 
self.fit_predict_single_fold(train, valid) + self.params['is_holdout'] = isinstance(train_valid_iterator,HoldoutIterator) + model, pred = self.fit_predict_single_fold(train, valid, 0) self.models.append(model) preds_arr[idx] += pred.reshape((pred.shape[0], -1)) counter_arr[idx] += 1 diff --git a/lightautoml/ml_algo/dl_model.py b/lightautoml/ml_algo/dl_model.py index 483bcf84..118b564a 100644 --- a/lightautoml/ml_algo/dl_model.py +++ b/lightautoml/ml_algo/dl_model.py @@ -1,6 +1,8 @@ """Neural net for tabular datasets.""" +from lightautoml.dataset.base import LAMLDataset +from lightautoml.dataset.utils import vconcatenate from lightautoml.utils.installation import __validate_extra_deps @@ -506,7 +508,7 @@ def init_params_on_input(self, train_valid_iterator) -> dict: ) return suggested_params - def get_dataloaders_from_dicts(self, data_dict: Dict): + def get_dataloaders_from_dicts(self, data_dict: Dict, n : int =0): """Construct dataloaders depending on stage. Args: @@ -532,6 +534,7 @@ def get_dataloaders_from_dicts(self, data_dict: Dict): } datasets[stage] = self.train_params["dataset"]( + fold = n, data=data, y=value.target.values if stage != "test" else np.ones(len(value.data)), w=value.weights.values if value.weights is not None else np.ones(len(value.data)), @@ -573,7 +576,7 @@ def fit_predict(self, train_valid_iterator: TrainValidIterator) -> NumpyDataset: self.params = self._init_params_on_input(train_valid_iterator) return super().fit_predict(train_valid_iterator) - def fit_predict_single_fold(self, train: TabularDataset, valid: TabularDataset): + def fit_predict_single_fold(self, train: TabularDataset, valid: TabularDataset, n=0): """Implements training and prediction on single fold. 
Args: @@ -589,16 +592,20 @@ def fit_predict_single_fold(self, train: TabularDataset, valid: TabularDataset): target = train.target self.params["bias"] = self.get_mean_target(target, task_name) if self.params["init_bias"] else None - model = self._infer_params(train) + if self.params['is_holdout']: + ds = train + else: + ds = vconcatenate([train,valid]) + model = self._infer_params(ds) model_path = ( os.path.join(self.path_to_save, f"{uuid.uuid4()}.pickle") if self.path_to_save is not None else None ) # init datasets if self.use_sampler: - dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas(),"sampler": train.to_pandas()}) + dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas(),"sampler": train.to_pandas()},n) else: - dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas()}) + dataloaders = self.get_dataloaders_from_dicts({"train": train.to_pandas(), "val": valid.to_pandas()},n) dataloaders['sampler'] = None val_pred = model.fit(dataloaders) diff --git a/lightautoml/ml_algo/torch_based/tabnet/utils.py b/lightautoml/ml_algo/torch_based/tabnet/utils.py index 8530be5d..dc2f9d75 100644 --- a/lightautoml/ml_algo/torch_based/tabnet/utils.py +++ b/lightautoml/ml_algo/torch_based/tabnet/utils.py @@ -2,8 +2,8 @@ import torch import numpy as np import torch.nn as nn -from ..torch_based.node_nn_model import Entmax15, Sparsemax -from ..torch_based.autoint.ghost_norm import GhostBatchNorm +from ..node_nn_model import Entmax15, Sparsemax +from ..autoint.ghost_norm import GhostBatchNorm def _initialize_non_glu(module, input_dim, output_dim): diff --git a/lightautoml/text/nn_model.py b/lightautoml/text/nn_model.py index 7508aead..131901d1 100644 --- a/lightautoml/text/nn_model.py +++ b/lightautoml/text/nn_model.py @@ -12,7 +12,7 @@ import torch.nn as nn from ..tasks.base import Task - +from .utils import _dtypes_mapping logger = 
logging.getLogger(__name__) @@ -31,6 +31,7 @@ class UniversalDataset: def __init__( self, + fold: int, data: Dict[str, np.ndarray], y: np.ndarray, w: Optional[np.ndarray] = None, @@ -38,6 +39,7 @@ def __init__( max_length: int = 256, stage: str = "test", ): + self.fold = fold self.data = data self.y = y self.w = w @@ -49,7 +51,7 @@ def __len__(self) -> int: return len(self.y) def __getitem__(self, index: int) -> Dict[str, np.ndarray]: - res = {"label": self.y[index]} + res = {"fold":self.fold ,"label": self.y[index]} res.update({key: value[index] for key, value in self.data.items() if key != "text"}) if (self.tokenizer is not None) and ("text" in self.data): sent = self.data["text"][index, 0] # only one column @@ -85,7 +87,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward-pass.""" x = torch.clamp(x, self.min_v, self.max_v) return x - + class TorchUniversalModel(nn.Module): """Mixed data model. @@ -133,6 +135,7 @@ def __init__( self.cont_embedder = None self.cat_embedder = None self.text_embedder = None + self.sampler = None n_in = 0 if cont_embedder_ is not None: @@ -212,9 +215,12 @@ def _init_last_layers(self, torch_model, bias, use_skip=False): except: logger.info3("Last linear layer not founded, so init_bias=False") - def get_logits(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: + + def get_logits(self, inp: Dict[str, torch.Tensor],efficient_bs:int = None) -> torch.Tensor: """Forward-pass of model with embeddings.""" outputs = [] + + if self.cont_embedder is not None: outputs.append(self.cont_embedder(inp)) @@ -228,8 +234,8 @@ def get_logits(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: output = torch.cat(outputs, dim=1) else: output = outputs[0] - if 'batch_size' in inp.keys(): - logits = self.torch_model(output,inp['batch_size']) + if efficient_bs is not None: + logits = self.torch_model(output,efficient_bs) else: logits = self.torch_model(output) return logits @@ -248,7 +254,16 @@ def get_preds_from_logits(self, logits: 
torch.Tensor) -> torch.Tensor: def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: """Forward-pass with output loss.""" - x = self.get_logits(inp) + efficient_bs = None + if inp['sampler'] is not None: + efficient_bs = len(inp['label']) + candidate_sample = next(inp['sampler']) + inp = { + i: torch.cat([inp[i], + (candidate_sample[i].long().to(self.torch_model.device) if _dtypes_mapping[i] == "long" else candidate_sample[i].to(self.torch_model.device))]) + for i in set(inp.keys())-set(['sampler']) + } + x = self.get_logits(inp,efficient_bs) if not self.loss_on_logits: x = self.get_preds_from_logits(x) @@ -257,6 +272,15 @@ def forward(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: def predict(self, inp: Dict[str, torch.Tensor]) -> torch.Tensor: """Prediction.""" - x = self.get_logits(inp) + efficient_bs = None + if inp['sampler'] is not None: + efficient_bs = len(inp['label']) + candidate_sample = next(inp['sampler']) + inp = { + i: torch.cat([inp[i], + (candidate_sample[i].long().to(self.torch_model.device) if _dtypes_mapping[i] == "long" else candidate_sample[i].to(self.torch_model.device))]) + for i in set(inp.keys())-set(['sampler']) + } + x = self.get_logits(inp,efficient_bs) x = self.get_preds_from_logits(x) return x diff --git a/lightautoml/text/trainer.py b/lightautoml/text/trainer.py index a68d13cc..dc127c1b 100644 --- a/lightautoml/text/trainer.py +++ b/lightautoml/text/trainer.py @@ -237,6 +237,20 @@ def load_state_dict(self, weights: Dict, model: nn.Module): return self +class InfIterator(object): + def __init__(self, dataloader): + self.dl = dataloader + self.it = iter(self.dl) + + def __iter__(self): + return self + + def __next__(self): + try: + return next(self.it) + except StopIteration: + self.it = iter(self.dl) + return next(self.it) class Trainer: """Torch main trainer class. 
@@ -435,16 +449,11 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: for epoch in range(self.n_epochs): self.epoch = epoch # train - if dataloaders['sampler'] is not None: - train_loss = self.train_with_sampler(dataloaders=dataloaders) - else: - train_loss = self.train(dataloaders=dataloaders) + train_loss = self.train(dataloaders=dataloaders) train_log.extend(train_loss) # test - if dataloaders['sampler'] is not None: - val_loss, val_data, weights = self.test_with_sampler(dataloader=dataloaders["val"], sampler = dataloaders["sampler"] ) - else: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + + val_loss, val_data, weights = self.test(dataloaders=dataloaders) if self.stop_by_metric: cond = -1 * self.metric(*val_data, weights) else: @@ -469,20 +478,14 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: self.se.set_best_params(self.model) if self.is_snap: - if dataloaders['sampler'] is not None: - val_loss, val_data, weights = self.test_with_sampler(dataloader=dataloaders["val"],sampler=dataloaders["sampler"], snap=True, stage="val") - else: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"], snap=True, stage="val") + val_loss, val_data, weights = self.test(dataloaders=dataloaders, snap=True, stage="val") logger.info3( "Result SE, val loss: {vl}, val metric: {me}".format( me=self.metric(*val_data, weights), vl=np.mean(val_loss) ) ) elif self.se.swa: - if dataloaders['sampler'] is not None: - val_loss, val_data, weights = self.test_with_sampler(dataloader=dataloaders["val"], sampler=dataloaders["sampler"]) - else: - val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) + val_loss, val_data, weights = self.test(dataloaders=dataloaders) logger.info3( "Early stopping: val loss: {vl}, val metric: {me}".format( me=self.metric(*val_data, weights), vl=np.mean(val_loss) @@ -515,15 +518,20 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: loader = 
tqdm(dataloaders["train"], desc="train", disable=False) else: loader = dataloaders["train"] + sampler = None + if dataloaders['sampler'] is not None: + # data['batch_size'] = len(sample['label']) + sampler = InfIterator(dataloaders['sampler']) for sample in loader: data = { i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) for i in sample.keys() } - data['batch_size'] = len(sample['label']) - if dataloaders['sampler'] is not None: - data['sampler'] = dataloaders['sampler'] - + # data['batch_size'] = len(sample['label']) + # if dataloaders['sampler'] is not None: + # # data['batch_size'] = len(sample['label']) + # data['sampler'] = dataloaders['sampler'] + data['sampler'] = sampler loss = self.model(data).mean() if self.apex: with self.amp.scale_loss(loss, self .optimizer) as scaled_loss: @@ -543,7 +551,7 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: c += 1 if self.verbose and self.verbose_bar and logging_level < logging.INFO: if self.verbose_inside and c % self.verbose_inside == 0: - val_loss, val_data, weights = self.test(dataloader=dataloaders) + val_loss, val_data, weights = self.test(dataloaders=dataloaders) if self.stop_by_metric: cond = -1 * self.metric(*val_data, weights) else: @@ -719,17 +727,18 @@ def test( loader = tqdm(dataloaders[stage], desc=stage, disable=False) else: loader = dataloaders[stage] - + sampler = None + if dataloaders['sampler'] is not None: + # data['batch_size'] = len(sample['label']) + sampler = InfIterator(dataloaders['sampler']) with torch.no_grad(): - for sample, candidate_sample in loader: + for sample in loader: data = { i: sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device) for i in sample.keys() } + data['sampler'] = sampler ### NOTE, HERE WE CAN ADD TORCH.UNIQUE - data['batch_size'] = len(sample['label']) - if dataloaders['sampler'] is not None: - data['sampler'] = dataloaders['sampler'] if snap: output = 
self.se.predict(data) loss = self.se.forward(data) if stage != "test" else None @@ -773,5 +782,5 @@ def predict(self, dataloaders: DataLoader, stage: str) -> np.ndarray: Prediction. """ - loss, (target, pred), _ = self.test(stage=stage, snap=self.is_snap, dataloader=dataloaders) + loss, (target, pred), _ = self.test(stage=stage, snap=self.is_snap, dataloaders=dataloaders) return pred diff --git a/lightautoml/text/utils.py b/lightautoml/text/utils.py index fe91f806..d1cc3d0d 100644 --- a/lightautoml/text/utils.py +++ b/lightautoml/text/utils.py @@ -23,6 +23,7 @@ "token_type_ids": "long", "text": "float", # embeddings "length": "long", + "fold": "long" } From 586ae5946645b8a472633b5cc31db12fd0a86d1f Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 16 Oct 2023 10:19:20 +0000 Subject: [PATCH 46/49] Descr --- lightautoml/ml_algo/torch_based/nn_models.py | 121 ++++++--- .../ml_algo/torch_based/saint/saint.py | 144 ---------- .../ml_algo/torch_based/saint/saint_utils.py | 256 ++++++++++++++++++ 3 files changed, 334 insertions(+), 187 deletions(-) delete mode 100644 lightautoml/ml_algo/torch_based/saint/saint.py create mode 100644 lightautoml/ml_algo/torch_based/saint/saint_utils.py diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index de698791..c4dc20fa 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -9,7 +9,7 @@ import torch import torch.nn as nn -from .saint.saint import ColTransformer, RowColTransformer +from .saint.saint_utils import ColTransformer, RowColTransformer from .tabnet.utils import TabNetEncoder, _initialize_non_glu from .autoint.autoint_utils import AttnInteractionBlock, LeakyGate from .autoint.ghost_norm import GhostBatchNorm @@ -1141,7 +1141,7 @@ def __init__( epsilon=1e-15, virtual_batch_size=128, momentum=0.02, - mask_type="entemax", + mask_type="entmax", group_attention_matrix=None, **kwargs, ): @@ -1192,61 +1192,91 @@ def 
forward_masks(self, x): class SAINT(nn.Module): + """Implementation of Saint from https://github.com/yandex-research/tabular-dl-tabr. + + Args: + n_in : int + Number of features + n_out : int or list of int for multi task classification + Dimension of network output + embedding_size : embedding_size + Dimension of the embedding + depth : int + Number of Attention Blocks. + heads : int + Number of heads in Attention. + dim_head : int + Attention head dimension. + mlp_hidden_mults : int | tuple[int] + Multiply hidden state of MLP. + ffn_mult : int + Multiply hidden state of feed forward layer. + attn_dropout : float + Post-Attention dropout. + ff_dropout : int + Feed-Forward Dropout. + mlp_dropout : float + MLP Dropout. + attentiontype : str + Either "colrow" or "row" : this is the masking attention to use + device : torch.device + kwargs : kwargs + """ + def __init__( self, n_in: int, n_out: int = 1, embedding_size: int = 10, - depth: int =2, + depth: int = 2, heads: int = 8, - dim_head = 16, - mlp_hidden_mults = (4, 2), - ffn_mult = 4, - attn_dropout = 0., - ff_dropout = 0., - mlp_dropout =0., - attentiontype = 'colrow', + dim_head=16, + mlp_hidden_mults=(4, 2), + ffn_mult=4, + attn_dropout=0.0, + ff_dropout=0.0, + mlp_dropout=0.0, + attentiontype="colrow", device: torch.device = torch.device("cuda:0"), - **kwargs - ): + **kwargs, + ): super().__init__() self.device = device self.cls_token = nn.Embedding(2, embedding_size) self.attentiontype = attentiontype - if attentiontype == 'col': + if attentiontype == "col": self.transformer = ColTransformer( - dim = embedding_size, - depth = depth, - heads = heads, - dim_head = dim_head, - attn_dropout = attn_dropout, - ff_dropout = ff_dropout + dim=embedding_size, + depth=depth, + heads=heads, + dim_head=dim_head, + attn_dropout=attn_dropout, + ff_dropout=ff_dropout, ) - elif attentiontype in ['row','colrow'] : + elif attentiontype in ["row", "colrow"]: self.transformer = RowColTransformer( - dim = embedding_size, - nfeats= 
n_in+1, #num featurs - depth = depth, - heads = heads, - dim_head = dim_head, - ffn_mult = ffn_mult, - attn_dropout = attn_dropout, - ff_dropout = ff_dropout, - style = attentiontype + dim=embedding_size, + nfeats=n_in + 1, # num featurs + depth=depth, + heads=heads, + dim_head=dim_head, + ffn_mult=ffn_mult, + attn_dropout=attn_dropout, + ff_dropout=ff_dropout, + style=attentiontype, ) - - l = (n_in+1) // 8 #input_size = (dim * self.num_categories) + (dim * num_continuous) - hidden_dimensions = list(map(lambda t: l * t, mlp_hidden_mults)) - - self.mlp = MLP(n_in = embedding_size, - n_out = n_out, - hidden_size = hidden_dimensions, - drop_rate=mlp_dropout, - use_bn = False, - dropout_first= False) - # self.embeds = nn.Embedding(self.total_tokens, self.dim) #.to(device) + l_rate = (n_in + 1) // 8 # input_size = (dim * self.num_categories) + (dim * num_continuous) + hidden_dimensions = list(map(lambda t: l_rate * t, mlp_hidden_mults)) + self.mlp = MLP( + n_in=embedding_size, + n_out=n_out, + hidden_size=hidden_dimensions, + drop_rate=mlp_dropout, + use_bn=False, + dropout_first=False, + ) def forward(self, embedded: torch.Tensor, bs: int) -> torch.Tensor: """Transform the input tensor. 
@@ -1254,15 +1284,20 @@ def forward(self, embedded: torch.Tensor, bs: int) -> torch.Tensor: Args: embedded : torch.Tensor embedded fields + bs : batch size Returns: torch.Tensor """ - mask = torch.zeros((len(embedded),len(embedded)), device=self.device, dtype=torch.bool) + mask = torch.zeros((len(embedded), len(embedded)), device=self.device, dtype=torch.bool) mask[torch.arange(bs), torch.arange(bs)] = 1 + # NOTE that it was: + # mask[:bs, bs:] = 1 + # mask[bs:, bs:] = 1 + # probably misprint mask[:bs, bs:] = 1 - mask[bs:, bs:] = 1 + mask[bs:, :bs] = 1 cls_token = torch.unsqueeze( self.cls_token(torch.ones(embedded.shape[0], dtype=torch.int).to(self.device)), dim=1 @@ -1276,4 +1311,4 @@ def forward(self, embedded: torch.Tensor, bs: int) -> torch.Tensor: # con_outs = self.mlp2(x[:,self.num_categories:,:]) # return cat_outs, con_outs - return self.mlp(x[:,0,:]) + return self.mlp(x[:, 0, :]) diff --git a/lightautoml/ml_algo/torch_based/saint/saint.py b/lightautoml/ml_algo/torch_based/saint/saint.py deleted file mode 100644 index 03761aae..00000000 --- a/lightautoml/ml_algo/torch_based/saint/saint.py +++ /dev/null @@ -1,144 +0,0 @@ - -import numpy as np -import torch -import torch.nn.functional as F -from einops import rearrange -from torch import einsum, nn - -def exists(val): - return val is not None - -def default(val, d): - return val if exists(val) else d - -def ff_encodings(x,B): - x_proj = (2. 
* np.pi * x.unsqueeze(-1)) @ B.t() - return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1) - - -class Residual(nn.Module): - def __init__(self, fn): - super().__init__() - self.fn = fn - - def forward(self, x, **kwargs): - return self.fn(x, **kwargs) + x - -class PreNorm(nn.Module): - def __init__(self, dim, fn): - super().__init__() - self.norm = nn.LayerNorm(dim) - self.fn = fn - - def forward(self, x, **kwargs): - return self.fn(self.norm(x), **kwargs) - -# attention - -class GEGLU(nn.Module): - def forward(self, x): - x, gates = x.chunk(2, dim = -1) - return x * F.gelu(gates) - -class FeedForward(nn.Module): - def __init__(self, dim, mult = 4, dropout = 0.): - super().__init__() - self.net = nn.Sequential( - nn.Linear(dim, int(dim * mult) * 2), - GEGLU(), - nn.Dropout(dropout), - nn.Linear(int(dim * mult), dim) - ) - - def forward(self, x, **kwargs): - return self.net(x, **kwargs) - -class Attention(nn.Module): - def __init__( - self, - dim, - heads = 8, - dim_head = 16, - dropout = 0. 
- ): - super().__init__() - inner_dim = dim_head * heads - self.heads = heads - self.scale = dim_head ** -0.5 - - self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False) - self.to_out = nn.Linear(inner_dim, dim) - - self.dropout = nn.Dropout(dropout) - - def forward(self, x, mask=None): - h = self.heads - q, k, v = self.to_qkv(x).chunk(3, dim = -1) - q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v)) - sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale - if mask is not None: - sim[~mask[None, None].expand_as(sim)] = float('-inf') - attn = sim.softmax(dim = -1) - out = einsum('b h i j, b h j d -> b h i d', attn, v) - out = rearrange(out, 'b h n d -> b n (h d)', h = h) - return self.to_out(out) - - -class RowColTransformer(nn.Module): - def __init__(self, dim, nfeats, depth, heads, dim_head, ffn_mult, attn_dropout, ff_dropout, style='col'): - super().__init__() - self.layers = nn.ModuleList([]) - self.mask_embed = nn.Embedding(nfeats, dim) - self.style = style - for _ in range(depth): - if self.style == 'colrow': - self.layers.append(nn.ModuleList([ - PreNorm(dim, Residual(Attention(dim, heads = heads, dim_head = dim_head, dropout = attn_dropout))), - PreNorm(dim, Residual(FeedForward(dim, mult=ffn_mult, dropout = ff_dropout))), - PreNorm(dim*nfeats, Residual(Attention(dim*nfeats, heads = heads, dim_head = dim_head, dropout = attn_dropout))), - PreNorm(dim*nfeats, Residual(FeedForward(dim*nfeats, mult=ffn_mult, dropout = ff_dropout))), - ])) - else: - self.layers.append(nn.ModuleList([ - PreNorm(dim*nfeats, Residual(Attention(dim*nfeats, heads = heads, dim_head = 64, dropout = attn_dropout))), - PreNorm(dim*nfeats, Residual(FeedForward(dim*nfeats, mult=ffn_mult, dropout = ff_dropout))), - ])) - - def forward(self, x, mask_features=None, mask_samples=None): - - _, n, _ = x.shape - if self.style == 'colrow': - for attn1, ff1, attn2, ff2 in self.layers: # type: ignore[code] - x = attn1(x, mask=mask_features) - x = ff1(x) - x = 
rearrange(x, 'b n d -> 1 b (n d)') - x = attn2(x, mask=mask_samples) - x = ff2(x) - x = rearrange(x, '1 b (n d) -> b n d', n = n) - else: - for attn1, ff1 in self.layers: # type: ignore[code] - x = rearrange(x, 'b n d -> 1 b (n d)') - x = attn1(x) - x = ff1(x) - x = rearrange(x, '1 b (n d) -> b n d', n = n) - return x - - -# transformer -class ColTransformer(nn.Module): - def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout): - super().__init__() - self.layers = nn.ModuleList([]) - - - for _ in range(depth): - self.layers.append(nn.ModuleList([ - PreNorm(dim, Residual(Attention(dim, heads = heads, dim_head = dim_head, dropout = attn_dropout))), - PreNorm(dim, Residual(FeedForward(dim, dropout = ff_dropout))), - ])) - - def forward(self, x, mask_features=None, mask_samples=None): - for attn, ff in self.layers: - x = attn(x) - x = ff(x) - return x diff --git a/lightautoml/ml_algo/torch_based/saint/saint_utils.py b/lightautoml/ml_algo/torch_based/saint/saint_utils.py new file mode 100644 index 00000000..d9dea227 --- /dev/null +++ b/lightautoml/ml_algo/torch_based/saint/saint_utils.py @@ -0,0 +1,256 @@ +"""Saint utils.""" + +from einops import rearrange +from torch import einsum, nn + +from ..fttransformer.fttransformer_utils import GEGLU + + +class Residual(nn.Module): + """Residual connection layer. + + Args: + fn : function to apply + """ + + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x, **kwargs): + """Forward-pass.""" + return self.fn(x, **kwargs) + x + + +class PreNorm(nn.Module): + """Normalization connection layer. + + Args: + fn : function to apply + """ + + def __init__(self, dim, fn): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.fn = fn + + def forward(self, x, **kwargs): + """Forward-pass.""" + return self.fn(self.norm(x), **kwargs) + + +# attention + + +class FeedForward(nn.Module): + """Feedforward for Transformer block. + + Args: + dim: Embeddings dimension. 
+ mult: multiply hidden state dim. + dropout: Post-Attention dropout. + """ + + def __init__(self, dim, mult=4, dropout=0.0): + super().__init__() + self.net = nn.Sequential( + nn.Linear(dim, int(dim * mult) * 2), GEGLU(), nn.Dropout(dropout), nn.Linear(int(dim * mult), dim) + ) + + def forward(self, x, **kwargs): + """Forward-pass. + + Args: + x : torch.Tensor + 3-d tensor; for example, embedded numeric and/or categorical values, + or the output of a previous attention layer. + kwargs: kwargs + + Returns: + torch.Tensor + + """ + return self.net(x, **kwargs) + + +class Attention(nn.Module): + """Attention Block. + + Args: + dim: Embeddings dimension. + heads: Number of heads in Attention. + dim_head: Attention head dimension. + dropout: Post-Attention dropout. + """ + + def __init__(self, dim, heads=8, dim_head=16, dropout=0.0): + super().__init__() + inner_dim = dim_head * heads + self.heads = heads + self.scale = dim_head ** -0.5 + + self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False) + self.to_out = nn.Linear(inner_dim, dim) + + self.dropout = nn.Dropout(dropout) + + def forward(self, x, mask=None): + """Transform the input tensor with attention. + + Args: + x : torch.Tensor + 3-d tensor; for example, embedded numeric and/or categorical values, + or the output of a previous attention layer. + mask: torch.Tensor + + Returns: + torch.Tensor + + """ + h = self.heads + q, k, v = self.to_qkv(x).chunk(3, dim=-1) + q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) + sim = einsum("b h i d, b h j d -> b h i j", q, k) * self.scale + if mask is not None: + sim[~mask[None, None].expand_as(sim)] = float("-inf") + attn = sim.softmax(dim=-1) + out = einsum("b h i j, b h j d -> b h i d", attn, v) + out = rearrange(out, "b h n d -> b n (h d)", h=h) + return self.to_out(out) + + +class RowColTransformer(nn.Module): + """Transformer Block. + + Args: + dim: Embeddings dimension. + nfeats: Number of features. + depth: Number of Attention Blocks. 
+ heads: Number of heads in Attention. + dim_head: Attention head dimension. + ffn_mult: multiply hidden state of feed forward layer. + attn_dropout: Post-Attention dropout. + ff_dropout: Feed-Forward Dropout. + style: attention style: 'col' or 'colrow' + """ + + def __init__(self, dim, nfeats, depth, heads, dim_head, ffn_mult, attn_dropout, ff_dropout, style="col"): + super().__init__() + self.layers = nn.ModuleList([]) + self.mask_embed = nn.Embedding(nfeats, dim) + self.style = style + for _ in range(depth): + if self.style == "colrow": + self.layers.append( + nn.ModuleList( + [ + PreNorm( + dim, Residual(Attention(dim, heads=heads, dim_head=dim_head, dropout=attn_dropout)) + ), + PreNorm(dim, Residual(FeedForward(dim, mult=ffn_mult, dropout=ff_dropout))), + PreNorm( + dim * nfeats, + Residual(Attention(dim * nfeats, heads=heads, dim_head=dim_head, dropout=attn_dropout)), + ), + PreNorm( + dim * nfeats, Residual(FeedForward(dim * nfeats, mult=ffn_mult, dropout=ff_dropout)) + ), + ] + ) + ) + else: + self.layers.append( + nn.ModuleList( + [ + PreNorm( + dim * nfeats, + Residual(Attention(dim * nfeats, heads=heads, dim_head=64, dropout=attn_dropout)), + ), + PreNorm( + dim * nfeats, Residual(FeedForward(dim * nfeats, mult=ffn_mult, dropout=ff_dropout)) + ), + ] + ) + ) + + def forward(self, x, mask_features=None, mask_samples=None): + """Transform the input embeddings tensor with Transformer module. + + Args: + x : torch.Tensor + 3-d tensor; embedded numeric and/or categorical values, + or the output of a previous Transformer layer. 
+ mask_features: torch.Tensor + mask for the first attention + mask_samples: torch.Tensor + mask for the second attention + + Returns: + torch.Tensor + + """ + _, n, _ = x.shape + if self.style == "colrow": + for attn1, ff1, attn2, ff2 in self.layers: # type: ignore[code] + x = attn1(x, mask=mask_features) + x = ff1(x) + x = rearrange(x, "b n d -> 1 b (n d)") + x = attn2(x, mask=mask_samples) + x = ff2(x) + x = rearrange(x, "1 b (n d) -> b n d", n=n) + else: + for attn1, ff1 in self.layers: # type: ignore[code] + x = rearrange(x, "b n d -> 1 b (n d)") + x = attn1(x) + x = ff1(x) + x = rearrange(x, "1 b (n d) -> b n d", n=n) + return x + + +# transformer +class ColTransformer(nn.Module): + """Transformer Block. + + Args: + dim: Embeddings dimension. + depth: Number of Attention Blocks. + heads: Number of heads in Attention. + dim_head: Attention head dimension. + attn_dropout: Post-Attention dropout. + ff_dropout: Feed-Forward Dropout. + """ + + def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout): + super().__init__() + self.layers = nn.ModuleList([]) + + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + PreNorm(dim, Residual(Attention(dim, heads=heads, dim_head=dim_head, dropout=attn_dropout))), + PreNorm(dim, Residual(FeedForward(dim, dropout=ff_dropout))), + ] + ) + ) + + def forward(self, x, mask_features=None, mask_samples=None): + """Transform the input embeddings tensor with Transformer module. + + Args: + x : torch.Tensor + 3-d tensor; embedded numeric and/or categorical values, + or the output of a previous Transformer layer. 
+ mask_features: torch.Tensor + not used + mask_samples: torch.Tensor + not used + + Returns: + torch.Tensor + + """ + for attn, ff in self.layers: + x = attn(x) + x = ff(x) + return x From 45c716657abb7f2922a91105a557246c2a06de47 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 16 Oct 2023 10:23:13 +0000 Subject: [PATCH 47/49] Descr --- lightautoml/text/trainer.py | 132 ------------------------------------ 1 file changed, 132 deletions(-) diff --git a/lightautoml/text/trainer.py b/lightautoml/text/trainer.py index dc127c1b..55be9cdb 100644 --- a/lightautoml/text/trainer.py +++ b/lightautoml/text/trainer.py @@ -570,138 +570,6 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: return loss_log - # def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: - # """Training loop. - - # Args: - # dataloaders: Dict with torch dataloaders. - - # Returns: - # Loss. - - # """ - # ################## - # loss_log = [] - # self.model.train() - # running_loss = 0 - # c = 0 - - # logging_level = get_stdout_level() - # if logging_level < logging.INFO and self.verbose and self.verbose_bar: - # loader = tqdm(dataloaders["train"], desc="train", disable=False) - # else: - # loader = dataloaders["train"] - - # for sample in loader: - # data = { - # i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) - # for i in sample.keys() - # } - - # loss = self.model(data).mean() - # if self.apex: - # with self.amp.scale_loss(loss, self.optimizer) as scaled_loss: - # scaled_loss.backward() - # else: - # loss.backward() - - # if self.clip_grad: - # torch.nn.utils.clip_grad_norm_(self.model.parameters(), **self.clip_grad_params) - # self.optimizer.step() - # self.optimizer.zero_grad() - - # loss = loss.data.cpu().numpy() - # loss_log.append(loss) - # running_loss += loss - - # c += 1 - # if self.verbose and self.verbose_bar and logging_level < logging.INFO: - # if self.verbose_inside and c % self.verbose_inside == 
0: - # val_loss, val_data, weights = self.test(dataloader=dataloaders["val"]) - # if self.stop_by_metric: - # cond = -1 * self.metric(*val_data, weights) - # else: - # cond = np.mean(val_loss) - # self.se.update(self.model, cond) - - # logger.info3( - # "Epoch: {e}, iter: {c}, val loss: {vl}, val metric: {me}".format( - # me=self.metric(*val_data, weights), - # e=self.epoch, - # c=c, - # vl=np.mean(val_loss), - # ) - # ) - # loader.set_description("train (loss=%g)" % (running_loss / c)) - - # return loss_log - - # def test( - # self, dataloader: DataLoader, stage: str = "val", snap: bool = False - # ) -> Tuple[List[float], Tuple[np.ndarray, np.ndarray]]: - # """Testing loop. - - # Args: - # dataloader: Torch dataloader. - # stage: Train, val or test. - # snap: Use snapshots. - - # Returns: - # Loss, (Target, OOF). - - # """ - # ##################### - # loss_log = [] - # weights_log = [] - # self.model.eval() - # pred = [] - # target = [] - # logging_level = get_stdout_level() - # if logging_level < logging.INFO and self.verbose and self.verbose_bar: - # loader = tqdm(dataloader, desc=stage, disable=False) - # else: - # loader = dataloader - - # with torch.no_grad(): - # for sample in loader: - # data = { - # i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) - # for i in sample.keys() - # } - - # if snap: - # output = self.se.predict(data) - # loss = self.se.forward(data) if stage != "test" else None - # else: - # output = self.model.predict(data) - # loss = self.model(data) if stage != "test" else None - - # if stage != "test": - # loss = loss.mean().data.cpu().numpy() - - # loss_log.append(loss) - - # output = output.data.cpu().numpy() - # target_data = data["label"].data.cpu().numpy() - # weights = data.get("weight", None) - # if weights is not None: - # weights = weights.data.cpu().numpy() - - # pred.append(output) - # target.append(target_data) - # weights_log.extend(weights) - - # self.model.train() - - # 
return ( - # loss_log, - # ( - # np.vstack(target) if len(target[0].shape) == 2 else np.hstack(target), - # np.vstack(pred) if len(pred[0].shape) == 2 else np.hstack(pred), - # ), - # np.array(weights_log), - # ) - def test( self, dataloaders: DataLoader,stage: str = "val", snap: bool = False ) -> Tuple[List[float], Tuple[np.ndarray, np.ndarray]]: From 81e9db4a3651f86f94950e6280d7aed7be6f8c69 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 16 Oct 2023 10:25:57 +0000 Subject: [PATCH 48/49] Descr --- lightautoml/text/trainer.py | 50 +++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/lightautoml/text/trainer.py b/lightautoml/text/trainer.py index 55be9cdb..d8a3bf73 100644 --- a/lightautoml/text/trainer.py +++ b/lightautoml/text/trainer.py @@ -1,10 +1,9 @@ """Main pytorch training and prediction class with Snapshots Ensemble.""" -from itertools import cycle import logging from copy import deepcopy -from typing import Any, Iterable +from typing import Any from typing import Callable from typing import Dict from typing import List @@ -237,14 +236,21 @@ def load_state_dict(self, weights: Dict, model: nn.Module): return self + class InfIterator(object): + """Infinite Iterator. + + Args: + dataloader : torch.utils.dataloader + """ + def __init__(self, dataloader): self.dl = dataloader self.it = iter(self.dl) def __iter__(self): return self - + def __next__(self): try: return next(self.it) @@ -252,6 +258,7 @@ def __next__(self): self.it = iter(self.dl) return next(self.it) + class Trainer: """Torch main trainer class. 
@@ -305,7 +312,6 @@ def __init__( stop_by_metric: bool = False, clip_grad: bool = False, clip_grad_params: Optional[Dict] = None, - **kwargs ): self.net = net @@ -328,7 +334,7 @@ def __init__( self.stop_by_metric = stop_by_metric self.clip_grad = clip_grad self.clip_grad_params = clip_grad_params if clip_grad_params is not None else {} - + self.dataloader = None self.model = None self.optimizer = None @@ -452,7 +458,7 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: train_loss = self.train(dataloaders=dataloaders) train_log.extend(train_loss) # test - + val_loss, val_data, weights = self.test(dataloaders=dataloaders) if self.stop_by_metric: cond = -1 * self.metric(*val_data, weights) @@ -496,7 +502,6 @@ def fit(self, dataloaders: Dict[str, DataLoader]) -> np.ndarray: return val_data[1] - def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: """Training loop. @@ -519,9 +524,9 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: else: loader = dataloaders["train"] sampler = None - if dataloaders['sampler'] is not None: + if dataloaders["sampler"] is not None: # data['batch_size'] = len(sample['label']) - sampler = InfIterator(dataloaders['sampler']) + sampler = InfIterator(dataloaders["sampler"]) for sample in loader: data = { i: (sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device)) @@ -531,10 +536,10 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: # if dataloaders['sampler'] is not None: # # data['batch_size'] = len(sample['label']) # data['sampler'] = dataloaders['sampler'] - data['sampler'] = sampler + data["sampler"] = sampler loss = self.model(data).mean() if self.apex: - with self.amp.scale_loss(loss, self .optimizer) as scaled_loss: + with self.amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() @@ -571,12 +576,12 @@ def train(self, dataloaders: Dict[str, DataLoader]) -> List[float]: return loss_log 
def test( - self, dataloaders: DataLoader,stage: str = "val", snap: bool = False + self, dataloaders: DataLoader, stage: str = "val", snap: bool = False ) -> Tuple[List[float], Tuple[np.ndarray, np.ndarray]]: """Testing loop. Args: - dataloader: Torch dataloader. + dataloaders: Torch dataloader. stage: Train, val or test. snap: Use snapshots. @@ -596,17 +601,17 @@ def test( else: loader = dataloaders[stage] sampler = None - if dataloaders['sampler'] is not None: + if dataloaders["sampler"] is not None: # data['batch_size'] = len(sample['label']) - sampler = InfIterator(dataloaders['sampler']) + sampler = InfIterator(dataloaders["sampler"]) with torch.no_grad(): for sample in loader: data = { i: sample[i].long().to(self.device) if _dtypes_mapping[i] == "long" else sample[i].to(self.device) for i in sample.keys() } - data['sampler'] = sampler - ### NOTE, HERE WE CAN ADD TORCH.UNIQUE + data["sampler"] = sampler + # NOTE, HERE WE CAN ADD TORCH.UNIQUE if snap: output = self.se.predict(data) loss = self.se.forward(data) if stage != "test" else None @@ -619,11 +624,11 @@ def test( loss_log.append(loss) - output = output.data.cpu().numpy()[:len(sample['label'])] - target_data = data["label"].data.cpu().numpy()[:len(sample['label'])] + output = output.data.cpu().numpy()[: len(sample["label"])] + target_data = data["label"].data.cpu().numpy()[: len(sample["label"])] weights = data.get("weight", None) - if weights is not None: - weights = weights.data.cpu().numpy()[:len(sample['label'])] + if weights is not None: + weights = weights.data.cpu().numpy()[: len(sample["label"])] pred.append(output) target.append(target_data) @@ -639,11 +644,12 @@ def test( ), np.array(weights_log), ) + def predict(self, dataloaders: DataLoader, stage: str) -> np.ndarray: """Predict model. Args: - dataloader: Torch dataloader. + dataloaders: Torch dataloader. stage: Train, val or test. 
Returns: From 2238f7e66d1641010f6ebec7c3ef81523fc15f66 Mon Sep 17 00:00:00 2001 From: Vasilev Dmitriy Date: Mon, 16 Oct 2023 11:37:06 +0000 Subject: [PATCH 49/49] added poolings --- lightautoml/ml_algo/torch_based/nn_models.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lightautoml/ml_algo/torch_based/nn_models.py b/lightautoml/ml_algo/torch_based/nn_models.py index c4dc20fa..0ee8c0b4 100644 --- a/lightautoml/ml_algo/torch_based/nn_models.py +++ b/lightautoml/ml_algo/torch_based/nn_models.py @@ -1237,6 +1237,7 @@ def __init__( ff_dropout=0.0, mlp_dropout=0.0, attentiontype="colrow", + pooling: str = "cls", device: torch.device = torch.device("cuda:0"), **kwargs, ): @@ -1268,9 +1269,9 @@ def __init__( l_rate = (n_in + 1) // 8 # input_size = (dim * self.num_categories) + (dim * num_continuous) hidden_dimensions = list(map(lambda t: l_rate * t, mlp_hidden_mults)) - + self.pooling = pooling_by_name[pooling]() self.mlp = MLP( - n_in=embedding_size, + n_in=embedding_size * 2 if pooling == "concat" else embedding_size, n_out=n_out, hidden_size=hidden_dimensions, drop_rate=mlp_dropout, @@ -1284,7 +1285,7 @@ def forward(self, embedded: torch.Tensor, bs: int) -> torch.Tensor: Args: embedded : torch.Tensor embedded fields - bs : batch size + bs : batch size without sampler's part Returns: torch.Tensor @@ -1310,5 +1311,7 @@ def forward(self, embedded: torch.Tensor, bs: int) -> torch.Tensor: # cat_outs = self.mlp1(x[:,:self.num_categories,:]) # con_outs = self.mlp2(x[:,self.num_categories:,:]) # return cat_outs, con_outs - - return self.mlp(x[:, 0, :]) + x_mask = torch.ones(x.shape, dtype=torch.bool).to(self.device) + pool_tokens = self.pooling(x=x, x_mask=x_mask) + logits = self.mlp(pool_tokens) + return logits