diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index e88bd6720..000000000 --- a/LICENSE.txt +++ /dev/null @@ -1,12 +0,0 @@ -Copyright (c) 2015-2017, the tick developers -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/online_forest.py b/online_forest.py new file mode 100644 index 000000000..5b9cf25dc --- /dev/null +++ b/online_forest.py @@ -0,0 +1,265 @@ +from tick.simulation import SimuLogReg, weights_sparse_gauss +from sklearn.model_selection import train_test_split +import numpy as np +from tick.inference import OnlineForestClassifier +from matplotlib.colors import ListedColormap + +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.datasets import make_moons, make_classification, make_circles +from sklearn.metrics import roc_auc_score +import matplotlib.pyplot as plt + +from time import time + + +np.set_printoptions(precision=2) + +# w0 = weights_sparse_gauss(n_features, nnz=2) +# X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate() +# y = (y + 1) / 2 + + +def plot_decisions_regression(clfs, datasets, names): + i = 1 + h = .02 + fig = plt.figure(figsize=(4 * (len(clfs) + 1), 4 * len(datasets))) + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + X, y = ds + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + # just plot the dataset first + cm = plt.cm.RdBu + ax = plt.subplot(len(datasets), len(clfs) + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=25, cmap=cm) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=25, + alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + # iterate over classifiers + for name, clf in zip(names, clfs): + ax = plt.subplot(len(datasets), len(clfs) + 1, i) + clf.fit(X_train, y_train) + Z = clf.predict(np.array([xx.ravel(), yy.ravel()]).T) + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + # Plot also the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm, s=15) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, + s=15, alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + i += 1 + + plt.tight_layout() + # plt.show() + + +def plot_decision_classification(classifiers, datasets, names): + n_classifiers = len(classifiers) + n_datasets = len(datasets) + h = .02 + fig = plt.figure(figsize=(2 * (n_classifiers + 1), 2 * n_datasets)) + i = 1 + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + # preprocess dataset, split into training and test part + X, y = ds + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + # just plot the dataset first + cm = plt.cm.RdBu + cm_bright = ListedColormap(['#FF0000', '#0000FF']) + ax = plt.subplot(n_datasets, n_classifiers + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + # Plot the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=10, cmap=cm) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=10, + alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + # iterate over classifiers + for name, clf in zip(names, classifiers): + ax = plt.subplot(n_datasets, n_classifiers + 1, i) + if hasattr(clf, 'clear'): + clf.clear() + clf.fit(X_train, y_train) + Z = clf.predict_proba(np.array([xx.ravel(), yy.ravel()]).T)[:, 1] + + score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), + size=15, horizontalalignment='right') + i += 1 + + plt.tight_layout() + # plt.show() + + +path = '/Users/stephane.gaiffas/Downloads/' + +n_samples = 20000 +n_features = 100 +n_classes = 2 + +# +# of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=30., n_passes=1, +# seed=123, use_aggregation=True) +# +# of.fit(X, y) + +# print("n_nodes:", of.n_nodes()) +# print("n_leaves:", of.n_leaves()) +# print(of.predict_proba(X)) +# print("step: ", of._forest.step()) + +# of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=1., +# seed=123, use_aggregation=True, n_passes=1) +# +# of.fit(X, y) + +# print("n_nodes:", of.n_nodes()) +# print("n_leaves:", of.n_leaves()) +# print(of.predict_proba(X)) +# print("step: ", of._forest.step()) + + +# exit(0) + +X, y = make_classification(n_samples=n_samples, n_features=n_features, + n_redundant=0, n_informative=2, random_state=1, + n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) + + +clf = OnlineForestClassifier(n_classes=n_classes, n_trees=50, seed=123, + step=1., use_aggregation=True) + +X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + +clf.fit(X_train, y_train) + +# clf.predict(X_test) + +exit(0) + +# clf.print() + +X, y = make_classification(n_samples=n_samples, n_features=n_features, n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) +linearly_separable = (X, y) + +datasets = [ + make_moons(n_samples=n_samples, noise=0.3, random_state=0), + make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), + linearly_separable +] + +n_trees = 10 + +of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, step=30., n_passes=1, + seed=123, use_aggregation=True) +seed = 123 + + +params = [ + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 0.1}, + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 0.5}, + {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1., 'n_passes': 1, 'dirichlet': 2}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.2, 'n_passes': 5}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 10}, + # {'use_aggregation': True, 'n_trees': 1, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 1, 'subsampling': 0.1, 'n_passes': 10}, + # + # {'use_aggregation': True, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 20}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 1}, + # {'use_aggregation': True, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 1, 'subsampling': 1, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 1, 'subsampling': 1, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 5, 'subsampling': 0.2, 'n_passes': 20}, + # {'use_aggregation': False, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 1}, + # {'use_aggregation': False, 'n_trees': 50, 'subsampling': 0.1, 'n_passes': 20}, +] + + +def toto(kkk): + return "OF(T: " \ + + str(kkk['n_trees']) + ", S: " + str(kkk['subsampling']) \ + + ', P: ' + str(kkk['n_passes']) + ', di: ' + str(kkk['dirichlet']) \ + + ")" + # return "OF(A: " + str(kkk['use_aggregation']) + ", T: " \ + # + str(kkk['n_trees']) + ", S: " + str(kkk['subsampling']) \ + # + ', P: ' + str(kkk['n_passes']) + ")" + + +names = list(toto(kw) for kw in params) + ["KNN", "ET", "BRF"] + +classifiers = list( + OnlineForestClassifier(n_classes=n_classes, seed=123, step=1., **kw) + for kw in params +) + +classifiers += [ + KNeighborsClassifier(n_neighbors=5), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) +] + +# names = [ +# "OF(agg, n_passes=1)", +# "OF(agg, n_passes=5)", +# "OF(agg, n_passes=10)", +# "OF(no agg., n_passes=1)", +# "KNN (k=5)", +# "ET", +# "BRF" +# ] + +plot_decision_classification(classifiers, datasets, names) + +# plt.savefig('decisions.pdf') + +plt.show() diff --git a/online_forest_agathe.py b/online_forest_agathe.py new file mode 100644 index 000000000..61759a106 --- /dev/null +++ b/online_forest_agathe.py @@ -0,0 +1,226 @@ +from tick.simulation import SimuLogReg, weights_sparse_gauss +from sklearn.model_selection import train_test_split +import numpy as np +from tick.inference import OnlineForestClassifier +from matplotlib.colors import ListedColormap + +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.datasets import make_moons, make_classification, make_circles +from sklearn.metrics import roc_auc_score +import matplotlib.pyplot as plt + +from time import time + + +n_samples = 1000 +n_features = 2 +seed = 123 + +np.set_printoptions(precision=2) + +w0 = weights_sparse_gauss(n_features, nnz=2) +X, y = SimuLogReg(w0, -1., n_samples=n_samples, seed=seed).simulate() +y = (y + 1) / 2 + + +def plot_decisions_regression(clfs, datasets, names): + i = 1 + h = .02 + fig = plt.figure(figsize=(4 * (len(clfs) + 1), 4 * len(datasets))) + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + X, y = ds + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + # just plot the dataset first + cm = plt.cm.RdBu + ax = plt.subplot(len(datasets), len(clfs) + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=25, cmap=cm) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=25, + alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + # iterate over classifiers + for name, clf in zip(names, clfs): + ax = plt.subplot(len(datasets), len(clfs) + 1, i) + clf.fit(X_train, y_train) + Z = clf.predict(np.array([xx.ravel(), yy.ravel()]).T) + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + # Plot also the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm, s=15) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, + s=15, alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + i += 1 + + plt.tight_layout() + # plt.show() + + +def plot_decision_classification(classifiers, datasets, names): + n_classifiers = len(classifiers) + n_datasets = len(datasets) + h = .02 + fig = plt.figure(figsize=(2 * (n_classifiers + 1), 2 * n_datasets)) + i = 1 + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + # preprocess dataset, split into training and test part + X, y = ds + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + # just plot the dataset first + cm = plt.cm.RdBu + cm_bright = ListedColormap(['#FF0000', '#0000FF']) + ax = plt.subplot(n_datasets, n_classifiers + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + # Plot the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=10, cmap=cm) + # and testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm, s=10, + alpha=0.6) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + # iterate over classifiers + for name, clf in zip(names, classifiers): + ax = plt.subplot(n_datasets, n_classifiers + 1, i) + if hasattr(clf, 'clear'): + clf.clear() + clf.fit(X_train, y_train) + Z = clf.predict_proba(np.array([xx.ravel(), yy.ravel()]).T)[:, 1] + + score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), + size=15, horizontalalignment='right') + i += 1 + + plt.tight_layout() + # plt.show() + + +path = '/Users/stephane.gaiffas/Downloads/' + +n_trees = 30 + +X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) + + +# clf = OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.) +# clf.fit(X, y) +# clf.print() + +linearly_separable = (X, y) + +X, y = make_moons(n_samples=n_samples, noise=0.3, random_state=0) + + +perm1 = np.arange(n_samples) +np.random.shuffle(perm1) +perm2 = np.arange(n_samples) +np.random.shuffle(perm2) +perm3 = np.arange(n_samples) +np.random.shuffle(perm3) + +perm4 = np.argsort(y) + +# datasets = [ +# make_moons(n_samples=n_samples, noise=0.3, random_state=0), +# make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1), +# linearly_separable +# ] + + +X2 = np.empty((50, 2)) + +X2[:, 0] = -1 + 0.5 * np.random.random(50) +X2[:, 1] = 0.7 + 0.5 * np.random.random(50) + + +y2 = np.ones(50) +X = np.vstack((X2, X)) +y = np.concatenate((y2, y)) + +# X, y + +datasets = [ + (X, y), + # (X[perm2, :], y[perm2]), + # (X[perm3, :], y[perm3]), + # (X[perm4, :], y[perm4]) +] + +# datasets = [ +# (X[perm1, :], y[perm1]), +# (X[perm2, :], y[perm2]), +# (X[perm3, :], y[perm3]), +# (X[perm4, :], y[perm4]) +# ] + +print(X[perm1]) + +from sklearn.neighbors import KNeighborsClassifier + + +classifiers = [ + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=True, n_classes=2), + # OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., use_aggregation=True), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., use_aggregation=False, n_classes=2), + KNeighborsClassifier(n_neighbors=5), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) +] + +names = [ + "OF (agg, step=1.)", + # "OF(agg, step=100.)", + "OF(no agg.)", + "KNN (k=5)", + "ET", + "BRF" +] + +plot_decision_classification(classifiers, datasets, names) + +# plt.savefig('decisions.pdf') + +plt.show() diff --git a/online_forest_data.py b/online_forest_data.py new file mode 100644 index 000000000..c51a499ba --- /dev/null +++ b/online_forest_data.py @@ -0,0 +1,99 @@ + +import os +import pandas as pd +import pickle as pkl + +from tick.inference import OnlineForestRegressor, OnlineForestClassifier +from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, \ + RandomForestClassifier, ExtraTreesClassifier +from sklearn.neighbors import KNeighborsClassifier + +import matplotlib.pyplot as plt + + +# TODO: options for types of sampling of the features +# TODO: online construction of the feature_importances +# TODO: python script that tries all combinations + +# TODO: what if we feed several times the same dataset +# TODO: show that the classifier is insensitive to the time of arrival of the points +# TODO: V-fold instead of train and test ? +# TODO: Set features importance with default to none +# TODO: implement a subsample strategy : only one tree is updated with the given sample +# TODO: tree aggregation +# TODO: different "types" of trees: no aggregation, aggregation and different temperatures + +# TODO: unittest for attributes +# TODO: unittest for wrong n_features in fit and predict and wrong labels in training + +# TODO: tryout multiple passes +# TODO: really make seed work with inline forest + + +path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' + +filenames = [ + 'dna.p', + 'letter.p', + 'satimage.p', + 'usps.p' +] + +n_classess = [3, 25, 5, 9] + +n_trees = 10 + +names = [ + "OF (agg, step=1.)", + "OF(agg, step=100.)", + "OF(no agg.)", + "KNN (k=5)", + "ET", + "BRF" +] + + +for filename, n_classes in zip(filenames, n_classess): + print(filename) + with open(os.path.join(path, filename), 'rb') as f: + data = pkl.load(f) + X_train = data['x_train'] + X_test = data['x_test'] + y_train = data['y_train'] + y_test = data['y_test'] + + classifiers = [ + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., + use_aggregation=True, n_classes=n_classes), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=100., + n_classes=n_classes, use_aggregation=True), + OnlineForestClassifier(n_trees=n_trees, seed=123, step=1., + use_aggregation=False, n_classes=n_classes), + KNeighborsClassifier(n_neighbors=5), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) + ] + + triche = RandomForestClassifier(n_estimators=n_trees) + triche.fit(X_train, y_train) + feature_importances = triche.feature_importances_ / triche.feature_importances_.sum() + # + # plt.stem(probabilities) + # plt.title('Features importance for ' + filename, fontsize=18) + # plt.xlabel('Features') + # plt.ylabel('Importance') + # # plt.show() + # plt.savefig(filename + '.pdf') + + # online_forest.set_probabilities(probabilities) + + # forest1 = + + for clf, name in zip(classifiers, names): + if hasattr(clf, 'clear'): + clf.clear() + clf.set_feature_importances(feature_importances) + # print('Fitting', name) + clf.fit(X_train, y_train) + # print('Done.') + print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test)) diff --git a/online_forest_datasets.py b/online_forest_datasets.py new file mode 100644 index 000000000..af21ee27a --- /dev/null +++ b/online_forest_datasets.py @@ -0,0 +1,394 @@ + +import os +import pandas as pd +import numpy as np +from time import time +import zipfile + +from sklearn.preprocessing import MinMaxScaler +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.metrics import roc_auc_score +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split + +from tick.inference import OnlineForestClassifier + + +path = '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/' + +# TODO: do for dna.p + + + +def read_abalone(path): + archive = zipfile.ZipFile(os.path.join(path, 'abalone.csv.zip'), 'r') + with archive.open('abalone.csv') as f: + data = pd.read_csv(f, header=None) + continuous = list(range(1, 8)) + discrete = [0] + y = data.pop(8) + y -= 1 + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = y.as_matrix() + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'abalone' + + +def read_adult(path): + archive = zipfile.ZipFile(os.path.join(path, 'adult.csv.zip'), 'r') + with archive.open('adult.csv') as f: + data = pd.read_csv(f, header=None) + y = data.pop(13) + discrete = [1, 3, 4, 5, 6, 7, 8, 12] + continuous = list(set(range(13)) - set(discrete)) + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = pd.get_dummies(y).as_matrix()[:, 1] + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'adult' + + +def read_bank(path): + archive = zipfile.ZipFile(os.path.join(path, 'bank.csv.zip'), 'r') + with archive.open('bank.csv') as f: + data = pd.read_csv(f) + y = data.pop('y') + discrete = ['job', 'marital', 'education', 'default', 'housing', + 'loan', 'contact', 'day', 'month', 'campaign', 'poutcome'] + continuous = ['age', 'balance', 'duration', 'pdays', 'previous'] + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = pd.get_dummies(y).as_matrix()[:, 1] + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'bank' + + +def read_car(path): + archive = zipfile.ZipFile(os.path.join(path, 'car.csv.zip'), 'r') + with archive.open('car.csv') as f: + data = pd.read_csv(f, header=None) + y = data.pop(6) + y = np.argmax(pd.get_dummies(y).as_matrix(), axis=1) + X = pd.get_dummies(data, prefix_sep='#').as_matrix() + return X, y, 'car' + + +def read_cardio(path): + archive = zipfile.ZipFile(os.path.join(path, 'cardiotocography.csv.zip'), + 'r') + with archive.open('cardiotocography.csv', ) as f: + data = pd.read_csv(f, sep=';', decimal=',') + + data.drop(['FileName', 'Date', 'SegFile', + 'A', 'B', 'C', 'D', 'E', 'AD', 'DE', + 'LD', 'FS', 'SUSP'], axis=1, inplace=True) + # A 10-class label + y_class = data.pop('CLASS').as_matrix() + y_class -= 1 + # A 3-class label + y_nsp = data.pop('NSP').as_matrix() + y_nsp -= 1 + continuous = [ + 'b', 'e', 'LBE', 'LB', 'AC', 'FM', 'UC', + 'ASTV', 'MSTV', 'ALTV', 'MLTV', + 'DL', 'DS', 'DP', + 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', + 'Mean', 'Median', 'Variance' + ] + + discrete = [ + 'Tendency' + ] + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + X = np.hstack((X_continuous, X_discrete)) + return X, y_nsp, 'cardio' + + +def read_churn(path): + archive = zipfile.ZipFile(os.path.join(path, 'churn.csv.zip'), 'r') + with archive.open('churn.csv') as f: + data = pd.read_csv(f) + y = data.pop('Churn?') + discrete = [ + 'State', 'Area Code', "Int'l Plan", 'VMail Plan', ] + + continuous = [ + 'Account Length', 'Day Mins', 'Day Calls', 'Eve Calls', 'Day Charge', + 'Eve Mins', 'Eve Charge', 'Night Mins', 'Night Calls', + 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', + 'CustServ Calls', 'VMail Message' + ] + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = pd.get_dummies(y).as_matrix()[:, 1] + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'churn' + + +def read_default_cb(path): + archive = zipfile.ZipFile(os.path.join(path, 'default_cb.csv.zip'), 'r') + with archive.open('default_cb.csv') as f: + data = pd.read_csv(f) + continuous = [ + 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'LIMIT_BAL', + 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', + 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6' + ] + discrete = [ + 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', + 'SEX', 'EDUCATION', 'MARRIAGE' + ] + _ = data.pop('ID') + y = data.pop('default payment next month') + X_continuous = MinMaxScaler().fit_transform(data[continuous]) + data_discrete = pd.get_dummies(data[discrete], prefix_sep='#') + X_discrete = data_discrete.as_matrix() + y = pd.get_dummies(y).as_matrix()[:, 1] + X = np.hstack((X_continuous, X_discrete)) + return X, y, 'default_cb' + + +def read_ijcnn1(path): + archive = zipfile.ZipFile(os.path.join(path, 'ijcnn1.csv.zip'), 'r') + with archive.open('ijcnn1.csv', ) as f: + data = pd.read_csv(f, header=None) + y = data.pop(12).as_matrix() + y = (y + 1) / 2 + X = data.as_matrix() + return X, y, 'ijcnn1' + + +def read_isolet(path): + archive = zipfile.ZipFile(os.path.join(path, 'isolet.zip'), 'r') + with archive.open('isolet/isolet1234.csv') as f: + data1 = pd.read_csv(f, header=None) + with archive.open('isolet/isolet5.csv') as f: + data2 = pd.read_csv(f, header=None) + data = pd.concat((data1, data2)) + y = data.pop(617).as_matrix() + y -= 1 + X = data.as_matrix() + return X, y, 'isolet' + + +def read_letter(path): + archive = zipfile.ZipFile(os.path.join(path, 'letter.csv.zip'), 'r') + with archive.open('letter.csv') as f: + data = pd.read_csv(f) + data.drop(['Unnamed: 0'], axis=1, inplace=True) + y = data.pop('y').as_matrix() + X = data.as_matrix() + return X, y, 'letter' + + +def read_nursery(path): + archive = zipfile.ZipFile(os.path.join(path, 'nursery.csv.zip'), 'r') + with archive.open('nursery.csv') as f: + data = pd.read_csv(f, header=None) + y1 = data.pop(7) + y1 = pd.get_dummies(y1) + y1 = y1.as_matrix().argmax(axis=1) + y2 = data.pop(8) + y2 = pd.get_dummies(y2) + y2 = y2.as_matrix().argmax(axis=1) + X = pd.get_dummies(data, prefix_sep='#').as_matrix() + return X, y2, 'nursery' + + +def read_ozone(path): + archive = zipfile.ZipFile(os.path.join(path, 'ozone.zip'), 'r') + with archive.open('ozone/ozone.eighthr.csv') as f: + data = pd.read_csv(f, header=None, na_values='?') + data.dropna(inplace=True) + data.drop([0], axis=1, inplace=True) + y = data.pop(73).as_matrix() + X = data.as_matrix() + return X, y, 'ozone' + + +def read_satimage(path): + archive = zipfile.ZipFile(os.path.join(path, 'satimage.csv.zip'), 'r') + with archive.open('satimage.csv') as f: + data = pd.read_csv(f) + data.drop(['Unnamed: 0'], axis=1, inplace=True) + y = data.pop('y').as_matrix() + X = data.as_matrix() + return X, y, 'satimage' + + +def read_sensorless(path): + archive = zipfile.ZipFile(os.path.join(path, 'sensorless.csv.zip'), 'r') + with archive.open('sensorless.csv') as f: + data = pd.read_csv(f, sep=' ', header=None) + y = data.pop(48).as_matrix() + y -= 1 + X = MinMaxScaler().fit_transform(data) + return X, y, 'sensorless' + + +def read_shuttle(path): + archive = zipfile.ZipFile(os.path.join(path, 'shuttle.csv.zip'), 'r') + with archive.open('shuttle.csv') as f: + data = pd.read_csv(f, header=None) + + y = data.pop(10).as_matrix() + y -= 1 + X = MinMaxScaler().fit_transform(data) + return X, y, 'shuttle' + + +def read_spambase(path): + archive = zipfile.ZipFile(os.path.join(path, 'spambase.csv.zip'), 'r') + with archive.open('spambase.csv') as f: + data = pd.read_csv(f, header=None) + y = data.pop(57).as_matrix() + X = MinMaxScaler().fit_transform(data) + return X, y, 'spambase' + + +def read_usps(path): + archive = zipfile.ZipFile(os.path.join(path, 'usps.csv.zip'), 'r') + with archive.open('usps.csv') as f: + data = pd.read_csv(f) + data.drop(['Unnamed: 0'], axis=1, inplace=True) + y = data.pop('y').as_matrix() + X = data.as_matrix() + return X, y, 'usps' + + +def read_wilt(path): + archive = zipfile.ZipFile(os.path.join(path, 'wilt.csv.zip'), 'r') + with archive.open('wilt.csv') as f: + data = pd.read_csv(f) + y = data.pop('class') + y = pd.get_dummies(y).as_matrix().argmax(axis=1) + X = MinMaxScaler().fit_transform(data) + return X, y, 'wilt' + + +def read_wine(path): + archive = zipfile.ZipFile(os.path.join(path, 'wine.zip'), 'r') + with archive.open('wine/red.csv') as f: + data_red = pd.read_csv(f, sep=';') + with archive.open('wine/white.csv') as f: + data_white = pd.read_csv(f, sep=";") + data = data_red + y = data.pop('quality').as_matrix() + y -= 3 + X = MinMaxScaler().fit_transform(data) + return X, y, 'wine' + + +readers = [ + read_abalone, + read_adult + read_bank + read_car, + read_cardio, + read_churn, + read_default_cb, + read_ijcnn1 + read_isolet, + read_letter, + read_nursery, + read_ozone, + read_satimage, + read_sensorless, + read_shuttle, + read_spambase, + read_usps, + read_wilt, + read_wine +] + +n_trees = 10 + +names = [ + "OF (agg, step=1.)", + "OF(agg, step=100.)", + "OF(no agg.)", + "KNN (k=5)", + "LR", + "ET", + "BRF" +] + + +data_description = pd.DataFrame( + columns=['name', '#samples', '#features', '#classes'] +) + +performances = pd.DataFrame( + columns=['dataset'] + names +) + +timings = pd.DataFrame( + columns=['dataset'] + names +) + + +for reader in readers: + # Read the data + X, y, dataset_name = reader(path) + X_train, X_test, y_train, y_test \ + = train_test_split(X, y, test_size=.3, random_state=42) + + n_samples, n_features = X.shape + n_classes = int(y.max() + 1) + + data_description = data_description.append( + pd.DataFrame([[dataset_name, n_samples, n_features, n_classes]], + columns=data_description.columns) + ) + + classifiers = [ + OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123, step=1., + use_aggregation=True), + OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123, + step=100., + use_aggregation=True), + OnlineForestClassifier(n_classes=n_classes, n_trees=n_trees, seed=123, step=1., + use_aggregation=False), + KNeighborsClassifier(n_neighbors=5), + LogisticRegression(class_weight='balanced'), + ExtraTreesClassifier(n_estimators=n_trees), + RandomForestClassifier(n_estimators=n_trees) + ] + + performance = [dataset_name] + timing = [dataset_name] + + for clf, name in zip(classifiers, names): + if hasattr(clf, 'clear'): + clf.clear() + t1 = time() + clf.fit(X_train, y_train) + t2 = time() + score = clf.score(X_test, y_test) + t = t2 - t1 + timing.append(t) + performance.append(score) + print('Accuracy of', name, ': ', + '%.2f' % score, + "in %.2f (s)" % t) + + performances = performances.append( + pd.DataFrame([performance], columns=performances.columns) + ) + timings = timings.append( + pd.DataFrame([timing], columns=timings.columns) + ) + +print(data_description) + +print(performances) + +print(timings) diff --git a/online_forest_selection.py b/online_forest_selection.py new file mode 100644 index 000000000..3ab4ab6d4 --- /dev/null +++ b/online_forest_selection.py @@ -0,0 +1,53 @@ + +from sklearn.model_selection import train_test_split +from sklearn.metrics import roc_auc_score +import numpy as np +from tick.inference import OnlineForestClassifier +from tick.simulation import weights_sparse_exp, SimuLogReg +from sklearn.ensemble import RandomForestClassifier + +np.set_printoptions(precision=2) + +n_samples = 30000 +n_features = 30 +n_classes = 2 + +nnz = 5 +w0 = np.zeros(n_features) +w0[:nnz] = 1 + +# w0 = weights_sparse_exp(n_features, nnz=nnz) + +X, y = SimuLogReg(weights=w0, intercept=None, n_samples=n_samples, + cov_corr=0.1).simulate() +y = (y + 1) / 2 + +X_train, X_test, y_train, y_test = train_test_split(X, y) + +rf = RandomForestClassifier(n_estimators=10, criterion="entropy") +of = OnlineForestClassifier(n_classes=n_classes, n_trees=10, seed=123, + step=1., use_aggregation=True) + +print(X_train.shape, y_train.shape, X_test.shape, y_test.shape) + +rf.fit(X_train, y_train) +of.fit(X_train, y_train) + +y_pred_rf = rf.predict_proba(X_test)[:, 1] +y_pred_of = of.predict_proba(X_test)[:, 1] + +print("AUC rf=", roc_auc_score(y_test, y_pred_rf)) +print("AUC of=", roc_auc_score(y_test, y_pred_of)) + + +import matplotlib.pyplot as plt + +plt.subplot(1, 3, 1) +plt.stem(rf.feature_importances_) +plt.subplot(1, 3, 2) +plt.stem(of.feature_importances) +plt.subplot(1, 3, 3) +plt.stem(w0) + +plt.show() +# print(clf.feature_importances) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c94cc223f..000000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -numpy -numpydoc -scipy -matplotlib -scikit-learn -pandas diff --git a/setup.py b/setup.py index c811c7d80..f61e97157 100644 --- a/setup.py +++ b/setup.py @@ -655,10 +655,14 @@ def add_dir_name(dir_name, filenames): inference_extension_info = { "cpp_files": ["hawkes_conditional_law.cpp", "hawkes_em.cpp", "hawkes_adm4.cpp", "hawkes_basis_kernels.cpp", - "hawkes_sumgaussians.cpp"], + "hawkes_sumgaussians.cpp", + "online_forest_regressor.cpp", + "online_forest_classifier.cpp"], "h_files": ["hawkes_conditional_law.h", "hawkes_em.h", "hawkes_adm4.h", "hawkes_basis_kernels.h", - "hawkes_sumgaussians.h"], + "hawkes_sumgaussians.h", + "online_forest_regressor.h", + "online_forest_classifier.h"], "swig_files": ["inference_module.i"], "module_dir": "./tick/inference/", "extension_name": "inference", diff --git a/tick/inference/__init__.py b/tick/inference/__init__.py index f8b0b5ca5..5e3f06e96 100644 --- a/tick/inference/__init__.py +++ b/tick/inference/__init__.py @@ -17,6 +17,9 @@ from .survival import kaplan_meier, nelson_aalen from .robust import std_iqr, std_mad +from .online_forest_regressor import OnlineForestRegressor +from .online_forest_classifier import OnlineForestClassifier + __all__ = [ "LinearRegression", "RobustLinearRegression", @@ -29,7 +32,9 @@ "HawkesEM", "HawkesADM4", "HawkesBasisKernels", - "HawkesSumGaussians," + "HawkesSumGaussians", + "OnlineForestRegressor", + "OnlineForestClassifier", "kaplan_meier", "nelson_aalen" ] diff --git a/tick/inference/online_forest_classifier.py b/tick/inference/online_forest_classifier.py new file mode 100644 index 000000000..25b9be29f --- /dev/null +++ b/tick/inference/online_forest_classifier.py @@ -0,0 +1,208 @@ +# License: BSD 3 clause + +from abc import ABC + +import numpy as np + +from tick.base import Base +from tick.base import actual_kwargs +from tick.preprocessing.utils import safe_array + +from .build.inference import OnlineForestClassifier as _OnlineForestClassifier +from .build.inference import CriterionClassifier_log as log + + +class OnlineForestClassifier(ABC, Base): + """Truly online random forest for regression (continuous labels). BLABLA + + TODO: update docstrings + + Parameters + ---------- + n_classes : `int` + Number of classes, we need this information since in a online setting, + we don't know the number of classes in advance. + + n_trees : `int`, default=10 + Number of trees to grow in the forest. Cannot be changed after the first + call to ``fit``. + + step : `float`, default=1. + Step-size for the aggregation weights. Default is 1 for classification. + + criterion : {'log'}, default='log' + The criterion used to selected a split. Supported criteria are: + * 'unif': splits are sampled uniformly in the range of the features, and + the feature to be splitted is chosen uniformly at random + * 'mse': the split and feature leading to the best variance reduction + is selected + This cannot be changed after the first call to ``fit`` + + use_aggregation : `bool`, default=True + If True + + n_threads : `int`, default=1 + The number of threads used to grow trees in parallel during training. + If n_threads < 0, then all available cores will be used. + + seed : `int`, default=-1 + If seed >= 0, this is used to seed the random number generators of the + forest. + + verbose : `bool`, default=True + If True, then verboses things during training + + Attributes + ---------- + n_samples : `int` + Number of samples seen during training + + n_features : int + The number of features from the training dataset (passed to ``fit``) + """ + + _attrinfos = { + '_actual_kwargs': {'writable': False}, + '_fitted': {'writable': False}, + '_forest': {'writable': False}, + '_criterion': {'writable': False, 'cpp_setter': 'set_criterion'}, + 'n_trees': {'writable': True, 'cpp_setter': 'set_n_trees'}, + 'n_threads': {'writable': True, 'cpp_setter': 'set_n_threads'}, + 'seed': {'writable': True, 'cpp_setter': 'set_seed'}, + 'verbose': {'writable': True, 'cpp_setter': 'set_verbose'}, + 'warm_start': {'writable': True, 'cpp_setter': 'set_warm_start'}, + 'n_splits': {'writable': True, 'cpp_setter': 'set_n_splits'}, + 'dirichlet': {'writable': True, 'cpp_setter': 'set_dirichlet'} + } + + _cpp_obj_name = "_forest" + + # TODO: n_classes must be mandatory + + @actual_kwargs + def __init__(self, n_classes: int, n_trees: int = 10, n_passes: int = 1, + step: float = 1., + criterion: str = 'log', use_aggregation: bool = True, + subsampling: float=1., dirichlet: float=None, + n_threads: int = 1, + seed: int = -1, verbose: bool = True): + Base.__init__(self) + if not hasattr(self, "_actual_kwargs"): + self._actual_kwargs = {} + self._fitted = False + self.n_trees = n_trees + self.n_passes = n_passes + self.n_features = None + self.n_classes = n_classes + self.step = step + self.criterion = criterion + self.n_threads = n_threads + self.seed = seed + self.verbose = verbose + self.use_aggregation = use_aggregation + self.subsampling = subsampling + if dirichlet is None: + dirichlet = 1 / n_classes + self.dirichlet = dirichlet + self._forest = None + + def set_data(self, X, y): + X = safe_array(X) + y = safe_array(y) + self._forest.set_data(X, y) + + def fit(self, X, y): + X = safe_array(X) + y = safe_array(y) + n_samples, n_features = X.shape + # TODO: check that sizes of X and y match + if self._forest is None: + self.n_features = n_features + _forest = _OnlineForestClassifier( + n_features, self.n_classes, self.n_trees, self.n_passes, + self.step, + self._criterion, self.use_aggregation, self.subsampling, + self.dirichlet, self.n_threads, self.seed, self.verbose + ) + self._set('_forest', _forest) + self._set("_fitted", True) + self._forest.fit(X, y) + return self + + def apply(self, X): + """Make the samples from X follow the trees from the forest, and return + the indices of the leaves + """ + raise NotImplementedError() + + def predict_proba(self, X): + """Predict class for given samples + + Parameters + ---------- + X : `np.ndarray` or `scipy.sparse.csr_matrix`, shape=(n_samples, n_features) + Features matrix to predict for. + + Returns + ------- + output : `np.ndarray`, shape=(n_samples, n_classes) + Returns predicted values. + """ + import numpy as np + scores = np.empty((X.shape[0], self.n_classes)) + if not self._fitted: + raise RuntimeError("You must call ``fit`` before") + else: + X = safe_array(X) + self._forest.predict(X, scores) + return scores + + def predict(self, X): + if not self._fitted: + raise RuntimeError("You must call ``fit`` before") + else: + scores = self.predict_proba(X) + return scores.argmax(axis=1) + + def clear(self): + self._forest.clear() + + def score(self, X, y): + from sklearn.metrics import accuracy_score + y_pred = self.predict(X) + return accuracy_score(y, y_pred) + + def print(self): + self._forest._print() + # TODO: property for splits + + def n_leaves(self): + n_leaves_per_tree = np.empty(self.n_trees, dtype=np.uint32) + self._forest.n_leaves(n_leaves_per_tree) + return n_leaves_per_tree + + def n_nodes(self): + n_nodes_per_tree = np.empty(self.n_trees, dtype=np.uint32) + self._forest.n_nodes(n_nodes_per_tree) + return n_nodes_per_tree + + @property + def criterion(self): + if self._criterion == log: + return 'log' + + @criterion.setter + def criterion(self, value): + if value == 'log': + self._set('_criterion', log) + else: + raise ValueError("``criterion`` must be either 'unif' or 'mse'.") + + def set_feature_importances(self, feature_importances): + self._forest.set_feature_importances(feature_importances) + + @property + def feature_importances(self): + feature_importances = np.empty(self.n_features) + self._forest.get_feature_importances(feature_importances) + return feature_importances diff --git a/tick/inference/online_forest_regressor.py b/tick/inference/online_forest_regressor.py new file mode 100644 index 000000000..1a5b28ba2 --- /dev/null +++ b/tick/inference/online_forest_regressor.py @@ -0,0 +1,178 @@ +# License: BSD 3 clause + +from abc import ABC + +from tick.base import Base +from tick.base import actual_kwargs + +from .build.inference import OnlineForestRegressor as _OnlineForestRegressor +from tick.preprocessing.utils import safe_array + +from .build.inference import CriterionRegressor_unif as unif +from .build.inference import CriterionRegressor_mse as mse + + +class OnlineForestRegressor(ABC, Base): + """Truly online random forest for regression (continuous labels). BLABLA + + Parameters + ---------- + n_trees : `int`, default=10 + Number of trees to grow in the forest. Cannot be changed after the first + call to ``fit``. + + criterion : {'unif', 'mse'}, default='unif' + The criterion used to selected a split. Supported criteria are: + * 'unif': splits are sampled uniformly in the range of the features, and + the feature to be splitted is chosen uniformly at random + * 'mse': the split and feature leading to the best variance reduction + is selected + This cannot be changed after the first call to ``fit`` + + max_depth : `int`, default=-1 + The maximum depth of a tree. If <= 0, nodes are splitted with no limit + on the depth of the tree + + min_samples_split : `int`, default=50 + A node waits to contain `min_samples_split` before splitting. + + n_threads : `int`, default=1 + The number of threads used to grow trees in parallel during training. + If n_threads < 0, then all available cores will be used. + + seed : `int`, default=-1 + If seed >= 0, this is used to seed the random number generators of the + forest. + + verbose : `bool`, default=True + If True, then verboses things during training + + warm_start : `bool`, default=True + If True, then successive calls to ``fit`` will continue to grow existing + trees. Otherwise, we start from empty trees + + n_splits : `int`, default=10 + Number of potential splits to consider for a feature. BLABLA ??? + + Attributes + ---------- + n_samples : `int` + Number of samples seen during training + + n_features : int + The number of features from the training dataset (passed to ``fit``) + """ + + _attrinfos = { + '_actual_kwargs': {'writable': False}, + '_fitted': {'writable': False}, + '_forest': {'writable': False}, + '_criterion': {'writable': False, 'cpp_setter': 'set_criterion'}, + 'n_trees': {'writable': True, 'cpp_setter': 'set_n_trees'}, + 'max_depth': {'writable': True, 'cpp_setter': 'set_max_depth'}, + 'min_samples_split': {'writable': True, + 'cpp_setter': 'set_min_samples_split'}, + 'n_threads': {'writable': True, 'cpp_setter': 'set_n_threads'}, + 'seed': {'writable': True, 'cpp_setter': 'set_seed'}, + 'verbose': {'writable': True, 'cpp_setter': 'set_verbose'}, + 'warm_start': {'writable': True, 'cpp_setter': 'set_warm_start'}, + 'n_splits': {'writable': True, 'cpp_setter': 'set_n_splits'}, + } + + _cpp_obj_name = "_forest" + + @actual_kwargs + def __init__(self, n_trees: int = 10, step: float = 1., + criterion: str = 'unif', + max_depth: int = -1, min_samples_split: int = 50, + n_threads: int = 1, seed: int = -1, verbose: bool = True, + warm_start: bool = True, n_splits: int = 10): + Base.__init__(self) + if not hasattr(self, "_actual_kwargs"): + self._actual_kwargs = {} + self._fitted = False + self.n_trees = n_trees + self.step = step + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.n_threads = n_threads + self.seed = seed + self.verbose = verbose + self.warm_start = warm_start + self.n_splits = n_splits + self._forest = _OnlineForestRegressor(n_trees, + step, + self._criterion, + #max_depth, + # min_samples_split, + n_threads, + seed, + verbose) + #warm_start, n_splits) + + def set_data(self, X, y): + X = safe_array(X) + y = safe_array(y) + self._forest.set_data(X, y) + + def fit(self, X, y): + X = safe_array(X) + y = safe_array(y) + self._set("_fitted", True) + self._forest.fit(X, y) + return self + + def apply(self, X): + """Make the samples from X follow the trees from the forest, and return + the indices of the leaves + """ + raise NotImplementedError() + + def predict(self, X, use_aggregation: bool=True): + """Predict class for given samples + + Parameters + ---------- + X : `np.ndarray` or `scipy.sparse.csr_matrix`, shape=(n_samples, n_features) + Features matrix to predict for. + + Returns + ------- + output : `np.array`, shape=(n_samples,) + Returns predicted values. + """ + import numpy as np + y_pred = np.empty(X.shape[0]) + if not self._fitted: + raise ValueError("You must call ``fit`` before") + else: + X = safe_array(X) + self._forest.predict(X, y_pred, True) + return y_pred + + def score(self, X, y): + from sklearn.metrics import r2_score + + def print(self): + self._forest._print() + + # TODO: property for splits + + @property + def criterion(self): + if self._criterion == unif: + return 'unif' + else: + return 'mse' + + @criterion.setter + def criterion(self, value): + if value == 'unif': + self._set('_criterion', unif) + # self._forest.set_criterion(unif) + elif value == 'mse': + self._set('_criterion', mse) + # self._forest.set_criterion(mse) + else: + raise ValueError("``criterion`` must be either 'unif' or 'mse'.") diff --git a/tick/inference/src/CMakeLists.txt b/tick/inference/src/CMakeLists.txt index acbb54d80..81d9f38d9 100644 --- a/tick/inference/src/CMakeLists.txt +++ b/tick/inference/src/CMakeLists.txt @@ -3,7 +3,9 @@ add_library(tick_inference EXCLUDE_FROM_ALL hawkes_em.cpp hawkes_em.h hawkes_adm4.h hawkes_adm4.cpp hawkes_basis_kernels.cpp hawkes_basis_kernels.h - hawkes_sumgaussians.h hawkes_sumgaussians.cpp) + hawkes_sumgaussians.h hawkes_sumgaussians.cpp + online_forest_regressor.h online_forest_regressor.cpp + online_forest_classifier.h online_forest_classifier.cpp) target_link_libraries(tick_inference diff --git a/tick/inference/src/online_forest_classifier.cpp b/tick/inference/src/online_forest_classifier.cpp new file mode 100644 index 000000000..b75a5d497 --- /dev/null +++ b/tick/inference/src/online_forest_classifier.cpp @@ -0,0 +1,878 @@ + +// License: BSD 3 clause + +#include "online_forest_classifier.h" + +/********************************************************************************* + * NodeClassifier methods + *********************************************************************************/ + +NodeClassifier::NodeClassifier(TreeClassifier &tree, uint32_t parent, double time) + : _tree(tree) { + _parent = parent; + _time = time; + _left = 0; + _right = 0; + _n_samples = 0; + _is_leaf = true; + _weight = 0; + _weight_tree = 0; + _counts = ArrayULong(n_classes()); + _counts.fill(0); +} + +NodeClassifier::NodeClassifier(const NodeClassifier &node) + : _tree(node._tree), + _parent(node._parent), _left(node._left), _right(node._right), + _feature(node._feature), _threshold(node._threshold), + _time(node._time), _features_min(node._features_min), _features_max(node._features_max), + _n_samples(node._n_samples), + _x_t(node._x_t), + _y_t(node._y_t), + _weight(node._weight), _weight_tree(node._weight_tree), + _is_leaf(node._is_leaf), + _counts(node._counts) {} + +NodeClassifier::NodeClassifier(const NodeClassifier &&node) : _tree(_tree) { + _parent = node._parent; + _left = node._left; + _right = node._right; + _feature = node._feature; + _threshold = node._threshold; + _time = node._time; + _features_min = node._features_min; + _features_max = node._features_max; + _n_samples = node._n_samples; + _x_t = node._x_t; + _y_t = node._y_t; + _weight = node._weight; + _weight_tree = node._weight_tree; + _is_leaf = node._is_leaf; + _counts = node._counts; +} + +NodeClassifier &NodeClassifier::operator=(const NodeClassifier &node) { + _parent = node._parent; + _left = node._left; + _right = node._right; + _feature = node._feature; + _threshold = node._threshold; + _time = node._time; + _features_min = node._features_min; + _features_max = node._features_max; + _n_samples = node._n_samples; + _x_t = node._x_t; + _y_t = node._y_t; + _weight = node._weight; + _weight_tree = node._weight_tree; + _is_leaf = node._is_leaf; + _counts = node._counts; + return *this; +} + +double NodeClassifier::update_downwards(const ArrayDouble &x_t, const double y_t) { + _n_samples++; + // double loss_t = loss(y_t); + if (use_aggregation()) { + _weight -= step() * loss(y_t); + } + update_predict(y_t); + // We return the loss before updating the predictor of the node in order to + // update the feature importance in TreeClassifier::go_downwards + return loss(y_t); +} + +bool NodeClassifier::is_same(const ArrayDouble &x_t) { + if (_is_leaf) { + for (uint32_t j = 0; j < n_features(); ++j) { + double delta = std::abs(x_t[j] - _x_t[j]); + if (delta > 0.) { + return false; + } + } + return true; + } else { + TICK_ERROR("NodeClassifier::is_same: node is not a leaf !") + } +} + +void NodeClassifier::update_upwards() { + if (_is_leaf) { + _weight_tree = _weight; + } else { + _weight_tree = log_sum_2_exp(_weight, node(_left).weight_tree() + node(_right).weight_tree()); + } +} + +void NodeClassifier::update_predict(const double y_t) { + // We update the counts for the class y_t + _counts[static_cast(y_t)]++; +} + +void NodeClassifier::update_range(const ArrayDouble &x_t) { + if (_n_samples == 0) { + _features_min = x_t; + _features_max = x_t; + } else { + for (uint32_t j = 0; j < n_features(); ++j) { + double x_tj = x_t[j]; + if (x_tj < _features_min[j]) { + _features_min[j] = x_tj; + } + if (x_tj > _features_max[j]) { + _features_max[j] = x_tj; + } + } + } +} + +double NodeClassifier::score(uint8_t c) const { + // Using the Dirichet prior + return (_counts[c] + dirichlet()) / (_n_samples + dirichlet() * n_classes()); +} + +inline void NodeClassifier::predict(ArrayDouble &scores) const { + for (uint8_t c = 0; c < n_classes(); ++c) { + scores[c] = score(c); + } +} + +double NodeClassifier::loss(const double y_t) { + // Log-loss + uint8_t c = static_cast(y_t); + return -std::log(score(c)); +} + +inline NodeClassifier &NodeClassifier::node(uint32_t index) const { + return _tree.node(index); +} + +uint32_t NodeClassifier::n_features() const { + return _tree.n_features(); +} + +uint8_t NodeClassifier::n_classes() const { + return _tree.n_classes(); +} + +inline double NodeClassifier::step() const { + return _tree.step(); +} + +inline double NodeClassifier::dirichlet() const { + return _tree.dirichlet(); +} + +inline uint32_t NodeClassifier::parent() const { + return _parent; +} + +inline NodeClassifier &NodeClassifier::set_parent(uint32_t parent) { + _parent = parent; + return *this; +} + +inline uint32_t NodeClassifier::left() const { + return _left; +} + +inline NodeClassifier &NodeClassifier::set_left(uint32_t left) { + _left = left; + return *this; +} + +inline uint32_t NodeClassifier::right() const { + return _right; +} + +inline NodeClassifier &NodeClassifier::set_right(uint32_t right) { + _right = right; + return *this; +} + +inline bool NodeClassifier::is_leaf() const { + return _is_leaf; +} + +inline NodeClassifier &NodeClassifier::set_is_leaf(bool is_leaf) { + _is_leaf = is_leaf; + return *this; +} + +inline uint32_t NodeClassifier::feature() const { + return _feature; +} + +inline NodeClassifier &NodeClassifier::set_feature(uint32_t feature) { + _feature = feature; + return *this; +} + +inline double NodeClassifier::threshold() const { + return _threshold; +} + +inline NodeClassifier &NodeClassifier::set_threshold(double threshold) { + _threshold = threshold; + return *this; +} + +inline double NodeClassifier::time() const { + return _time; +} + +inline NodeClassifier &NodeClassifier::set_time(double time) { + _time = time; + return *this; +} + +inline double NodeClassifier::features_min(const uint32_t j) const { + return _features_min[j]; +} + +inline NodeClassifier &NodeClassifier::set_features_min(const ArrayDouble &features_min) { + _features_min = features_min; + return *this; +} + +inline double NodeClassifier::features_max(const uint32_t j) const { + return _features_max[j]; +} + +inline NodeClassifier &NodeClassifier::set_features_max(const ArrayDouble &features_max) { + _features_max = features_max; + return *this; +} + +inline uint32_t NodeClassifier::n_samples() const { + return _n_samples; +} + +inline NodeClassifier &NodeClassifier::set_n_samples(uint32_t n_samples) { + _n_samples = n_samples; + return *this; +} + +inline bool NodeClassifier::use_aggregation() const { + return _tree.use_aggregation(); +} + +inline double NodeClassifier::weight() const { + return _weight; +} + +inline NodeClassifier &NodeClassifier::set_weight(double weight) { + _weight = weight; + return *this; +} + +inline double NodeClassifier::weight_tree() const { + return _weight_tree; +} + +inline NodeClassifier &NodeClassifier::set_weight_tree(double weight_tree) { + _weight_tree = weight_tree; + return *this; +} + +inline const ArrayDouble &NodeClassifier::x_t() const { + return _x_t; +} + +inline NodeClassifier &NodeClassifier::set_x_t(const ArrayDouble &x_t) { + _x_t = x_t; + return *this; +} + +inline double NodeClassifier::y_t() const { + return _y_t; +} + +inline NodeClassifier &NodeClassifier::set_y_t(const double y_t) { + _y_t = y_t; + return *this; +} + +void NodeClassifier::print() { + std::cout << "Node(parent: " << _parent + << ", left: " << _left + << ", right: " << _right + << ", time: " << std::setprecision(2) << _time + << ", n_samples: " << _n_samples + << ", is_leaf: " << _is_leaf + << ", feature: " << _feature + << ", thresh: " << _threshold + << ", scores: [" << std::setprecision(2) << score(0) << ", " << std::setprecision(2) << score(1) << "]" + << ", counts: [" << std::setprecision(2) << _counts[0] << ", " << std::setprecision(2) << _counts[1] << "]"; + if (_n_samples > 0) { + std::cout << ", min: [" << std::setprecision(2) << _features_min[0] << ", " << std::setprecision(2) + << _features_min[1] << "]" + << ", max: [" << std::setprecision(2) << _features_max[0] << ", " << std::setprecision(2) + << _features_max[1] << "]"; + + } + std::cout << ", weight: " << _weight + << ", weight_tree: " << _weight_tree + << ")\n"; +} + +/********************************************************************************* +* TreeClassifier methods +*********************************************************************************/ + +TreeClassifier::TreeClassifier(const TreeClassifier &tree) + : forest(tree.forest), nodes(tree.nodes) {} + +TreeClassifier::TreeClassifier(const TreeClassifier &&tree) + : forest(tree.forest), nodes(tree.nodes) {} + +TreeClassifier::TreeClassifier(OnlineForestClassifier &forest) : forest(forest) { + // TODO: pre-allocate the vector to make things faster ? + add_node(0); + feature_importances_ = ArrayDouble(forest.n_features()); + feature_importances_.fill(1.); +} + +void TreeClassifier::extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t) { + // std::cout << "Extending the range of: " << index << std::endl; + NodeClassifier ¤t_node = node(node_index); + if (current_node.n_samples() == 0) { + // The node is a leaf with no sample point, so it does not have a range + // In this case we just initialize the range with the given feature. + // This node will then be updated by the call to update_downwards in go_downwards + current_node.set_features_min(x_t); + current_node.set_features_max(x_t); + } else { + // std::cout << "Computing extension" << std::endl; + ArrayDouble extension(n_features()); + double extensions_sum = 0; + for (uint32_t j = 0; j < n_features(); ++j) { + double x_tj = x_t[j]; + double feature_min_j = current_node.features_min(j); + double feature_max_j = current_node.features_max(j); + if (x_tj < feature_min_j) { + extension[j] = feature_min_j - x_tj; + extensions_sum += feature_min_j - x_tj; + } else { + if (x_tj > feature_max_j) { + extension[j] = x_tj - feature_max_j; + extensions_sum += x_tj - feature_max_j; + } else { + extension[j] = 0; + } + } + } +// std::cout << "extension: [" << extension[0] << ", " << std::setprecision(2) << extension[1] << "]" << std::endl; +// std::cout << "extension_sum: " << std::setprecision(2) << extensions_sum << std::endl; +// std::cout << "... Done computing extension." << std::endl; + + // If the sample x_t extends the current range of the node + if (extensions_sum > 0) { + // std::cout << "Extension non-zero, considering the possibility of a split" << std::endl; + bool do_split; + double time = current_node.time(); + double T = forest.sample_exponential(extensions_sum); + // std::cout << "time: " << std::setprecision(2) << time << ", T: " << std::setprecision(2) << T << std::endl; + // Let us determine if we need to split the node or not + if (current_node.is_leaf()) { + // std::cout << "I'll split the node since it's a leaf" << std::endl; + do_split = true; + } else { + // Same as node(current_node.right()).time(); + double child_time = node(current_node.left()).time(); + // Sample a exponential random variable with intensity + if (time + T < child_time) { + // std::cout << " I'll split since time + T < child_time with child_time: " << child_time << std::endl; + do_split = true; + } else { + // std::cout << "I won't split since time + T >= child_time with child_time: " << child_time << std::endl; + do_split = false; + } + } + if (do_split) { + // std::cout << "Starting the splitting of node: " << node_index << std::endl; + // Sample the splitting feature with a probability proportional to the range extensions + ArrayDouble probabilities = extension; + probabilities /= extensions_sum; + // std::cout << "using the probabilities: [" << std::setprecision(2) << probabilities[0] << ", " << std::setprecision(2) << probabilities[1] << "]" << std::endl; + uint32_t feature = forest.sample_feature(probabilities); + // std::cout << "sampled feature: " << feature << std::endl; + double threshold; + // Is the extension on the right side ? + bool is_right_extension = x_t[feature] > current_node.features_max(feature); + // Create new nodes + uint32_t left_new = add_node(node_index, time + T); + uint32_t right_new = add_node(node_index, time + T); + if (is_right_extension) { + // std::cout << "extension is on the right" << std::endl; + threshold = forest.sample_threshold(node(node_index).features_max(feature), x_t[feature]); + // std::cout << "sample inside the extension the threshold: " << threshold << std::endl; + // left_new is the same as node_index, excepted for the parent, time and the fact that it's not a leaf + // std::cout << "Let's copy the current node in the left child" << threshold << std::endl; + node(left_new) = node(node_index); + // donc faut remettre le bon parent et le bon temps + // TODO: set_is_leaf useless for left_new since it's a copy of node_index + // std::cout << "Let's the update the left child" << std::endl; + node(left_new).set_parent(node_index).set_time(time + T); + // right_new doit avoir comme parent node_index + // std::cout << "Let's the update the right child" << std::endl; + node(right_new).set_parent(node_index).set_time(time + T); + // We must tell the old childs that they have a new parent, if the current node is not a leaf + if (!node(node_index).is_leaf()) { + // std::cout << "The current node is not a leaf, so let's not forget to update the old childs" << std::endl; + node(node(node_index).left()).set_parent(left_new); + node(node(node_index).right()).set_parent(left_new); + } + // TODO: faut retourner right_new dans ce cas ? + } else { + // std::cout << "extension is on the left" << std::endl; + threshold = forest.sample_threshold(x_t[feature], node(node_index).features_min(feature)); + node(right_new) = node(node_index); + node(right_new).set_parent(node_index).set_time(time + T); + node(left_new).set_parent(node_index).set_time(time + T); + if (!node(node_index).is_leaf()) { + node(node(node_index).left()).set_parent(right_new); + node(node(node_index).right()).set_parent(right_new); + } + } + // We update the splitting feature, threshold, and childs of the current index + node(node_index).set_feature(feature).set_threshold(threshold).set_left(left_new) + .set_right(right_new).set_is_leaf(false); + } + // Update the range of the node here + node(node_index).update_range(x_t); + } + } + // std::cout << "...Done extending the range." << std::endl; +} + +uint32_t TreeClassifier::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { + // Find the leaf that contains the sample. Start at the root. Index of the root is always 0. + // If predict == true, this is for prediction only, so no leaf update and splits can be done. + uint32_t index_current_node = 0; + bool is_leaf = false; + double loss_t = 0; + uint32_t feature = 0; + + while (!is_leaf) { + if (!predict) { + // Extend the range and eventually split the current node + extend_range(index_current_node, x_t, y_t); + // Update the current node. We get the loss for this point before the node update + // to compute feature importance below + NodeClassifier& current_node = node(index_current_node); + feature = current_node.feature(); + loss_t = current_node.update_downwards(x_t, y_t); + } + // Is the node a leaf ? + NodeClassifier ¤t_node = node(index_current_node); + is_leaf = current_node.is_leaf(); + if (!is_leaf) { + if (x_t[current_node.feature()] <= current_node.threshold()) { + index_current_node = current_node.left(); + } else { + index_current_node = current_node.right(); + } + if(!predict) { + // Compute the difference with the loss of the child + loss_t -= node(index_current_node).loss(y_t); + feature_importances_[feature] += loss_t; + } + } + } + return index_current_node; +} + + +void TreeClassifier::go_upwards(uint32_t leaf_index) { + // std::cout << "Going upwards" << std::endl; + uint32_t current = leaf_index; + while (true) { + NodeClassifier ¤t_node = node(current); + current_node.update_upwards(); + if (current == 0) { + // std::cout << "...Done going upwards." << std::endl; + break; + } + // We must update the root node + current = node(current).parent(); + } +} + +inline uint32_t TreeClassifier::n_nodes() const { + return _n_nodes; +} + +uint32_t TreeClassifier::n_leaves() const { + uint32_t n_leaves = 0; + for (const NodeClassifier &node: nodes) { + if (node.is_leaf()) { + ++n_leaves; + } + } + return n_leaves; +} + +void TreeClassifier::print() { + std::cout << "Tree(n_nodes: " << _n_nodes << std::endl; + std::cout << " "; + uint32_t index = 0; + for (NodeClassifier &node : nodes) { + std::cout << "index: " << index << " "; + node.print(); + index++; + } + std::cout << ")" << std::endl; +} + +void TreeClassifier::fit(const ArrayDouble &x_t, double y_t) { + uint32_t leaf = go_downwards(x_t, y_t, false); + if (use_aggregation()) { + go_upwards(leaf); + } + iteration++; +} + +void TreeClassifier::predict(const ArrayDouble &x_t, ArrayDouble &scores, bool use_aggregation) { +// std::cout << "TreeClassifier::predict" << std::endl; + uint32_t leaf = go_downwards(x_t, 0., true); + if (!use_aggregation) { +// std::cout << "Not using aggregation so using only the leaf's prediction" << std::endl; + node(leaf).predict(scores); + return; + } + +// std::cout << "Done." << std::endl; + uint32_t current = leaf; + // The child of the current node that does not contain the data + ArrayDouble pred_new(n_classes()); + while (true) { +// std::cout << "node: " << current << std::endl; + NodeClassifier ¤t_node = node(current); + if (current_node.is_leaf()) { +// std::cout << "predict leaf" << std::endl; + current_node.predict(scores); + } else { +// std::cout << "predict node" << std::endl; + double w = std::exp(current_node.weight() - current_node.weight_tree()); + // Get the predictions of the current node + current_node.predict(pred_new); + for (uint8_t c = 0; c < n_classes(); ++c) { + scores[c] = 0.5 * w * pred_new[c] + (1 - 0.5 * w) * scores[c]; + } + } + // Root must be updated as well + if (current == 0) { +// std::cout << "Done with predict." << std::endl; + break; + } + current = current_node.parent(); + } +} + +uint32_t TreeClassifier::add_node(uint32_t parent, double time) { + nodes.emplace_back(*this, parent, time); + return _n_nodes++; +} + +inline uint32_t TreeClassifier::n_features() const { + return forest.n_features(); +} + +inline uint8_t TreeClassifier::n_classes() const { + return forest.n_classes(); +} + +inline double TreeClassifier::step() const { + return forest.step(); +} + +inline double TreeClassifier::dirichlet() const { + return forest.dirichlet(); +} + +inline CriterionClassifier TreeClassifier::criterion() const { + return forest.criterion(); +} + +inline bool TreeClassifier::use_aggregation() const { + return forest.use_aggregation(); +} + +/********************************************************************************* + * OnlineForestClassifier methods + *********************************************************************************/ + +OnlineForestClassifier::OnlineForestClassifier(uint32_t n_features, + uint8_t n_classes, + uint8_t n_trees, + uint8_t n_passes, + double step, + CriterionClassifier criterion, + bool use_aggregation, + double subsampling, + double dirichlet, + int32_t n_threads, + int seed, + bool verbose) + : _n_features(n_features), + _n_classes(n_classes), + _n_trees(n_trees), + _n_passes(n_passes), + _step(step), + _criterion(criterion), + _use_aggregation(use_aggregation), + _subsampling(subsampling), + _dirichlet(dirichlet), + _n_threads(n_threads), + _verbose(verbose), + rand(seed) { + // No iteration so far + _iteration = 0; + create_trees(); +} + +OnlineForestClassifier::~OnlineForestClassifier() {} + +void OnlineForestClassifier::create_trees() { + // Just in case... + trees.clear(); + trees.reserve(_n_trees); + // Better tree allocation + for (uint32_t i = 0; i < _n_trees; ++i) { + trees.emplace_back(*this); + } +} + +void OnlineForestClassifier::fit(const SArrayDouble2dPtr features, + const SArrayDoublePtr labels) { + uint32_t n_samples = static_cast(features->n_rows()); + uint32_t n_features = static_cast(features->n_cols()); + if (_iteration == 0) { + _n_features = n_features; + } else { + check_n_features(n_features, false); + } + + // TODO: remove this + _features = features; + _labels = labels; + + // set_n_features(n_features); + + for (uint8_t pass = 0; pass < _n_passes; ++pass) { + for (uint32_t i = 0; i < n_samples; ++i) { + for (TreeClassifier &tree : trees) { + // Fit the tree online using the new data point + double label = (*labels)[i]; + check_label(label); + double U = rand.uniform(); + if (U <= _subsampling) { + tree.fit(view_row(*features, i), (*labels)[i]); + } + } + _iteration++; + } + } + // std::cout << "Done OnlineForestClassifier::fit" << std::endl; +} + +void OnlineForestClassifier::predict(const SArrayDouble2dPtr features, + SArrayDouble2dPtr scores) { + scores->fill(0.); +// std::cout << "features->n_rows(): " << features->n_rows() << ", features->n_cols(): " << features->n_cols() << std::endl; +// std::cout << "scores->n_rows(): " << scores->n_rows() << ", scores->n_cols(): " << scores->n_cols() << std::endl; +// std::cout << "n_classes: " << _n_classes << std::endl; + uint32_t n_features = static_cast(features->n_cols()); + check_n_features(n_features, true); + if (_iteration > 0) { + uint32_t n_samples = static_cast(features->n_rows()); + ArrayDouble scores_tree(_n_classes); + scores_tree.fill(0.); + ArrayDouble scores_forest(_n_classes); + scores_forest.fill(0.); + for (uint32_t i = 0; i < n_samples; ++i) { + // The prediction is simply the average of the predictions + ArrayDouble scores_i = view_row(*scores, i); + for (TreeClassifier &tree : trees) { +// std::cout << "predict for tree " << std::endl; + tree.predict(view_row(*features, i), scores_tree, _use_aggregation); + // TODO: use a .incr method instead ?? + scores_i.mult_incr(scores_tree, 1.); + } + scores_i /= _n_trees; + } + } else { + TICK_ERROR("You must call ``fit`` before ``predict``.") + } +} + +void OnlineForestClassifier::clear() { + create_trees(); + _iteration = 0; +} + +void OnlineForestClassifier::print() { + for (TreeClassifier &tree: trees) { + tree.print(); + } +} + +inline uint32_t OnlineForestClassifier::sample_feature() { + return rand.uniform_int(static_cast(0), n_features() - 1); +} + +//inline uint32_t OnlineForestClassifier::sample_feature_bis() { +// return rand.discrete(_fefeature_importances_); +//} + +inline double OnlineForestClassifier::sample_exponential(double intensity) { + return rand.exponential(intensity); +} + +inline uint32_t OnlineForestClassifier::sample_feature(const ArrayDouble &prob) { +// ArrayDouble my_prob = prob; +// for(uint32_t j = 0; j < n_features(); ++j) { +// // my_prob[j] *= _feature_importances[j]; +// my_prob[j] = _feature_importances[j]; +// } +// my_prob /= my_prob.sum(); + // return rand.discrete(my_prob); + return rand.discrete(prob); + +} + +inline double OnlineForestClassifier::sample_threshold(double left, double right) { + return rand.uniform(left, right); +} + +void OnlineForestClassifier::n_nodes(SArrayUIntPtr n_nodes_per_tree) { + uint8_t j = 0; + for (TreeClassifier &tree : trees) { + (*n_nodes_per_tree)[j] = tree.n_nodes(); + j++; + } +} + +void OnlineForestClassifier::n_leaves(SArrayUIntPtr n_leaves_per_tree) { + uint8_t j = 0; + for (TreeClassifier &tree : trees) { + (*n_leaves_per_tree)[j] = tree.n_leaves(); + j++; + } +} + +bool OnlineForestClassifier::verbose() const { + return _verbose; +} + +OnlineForestClassifier &OnlineForestClassifier::set_verbose(bool verbose) { + _verbose = verbose; + return *this; +} + +bool OnlineForestClassifier::use_aggregation() const { + return _use_aggregation; +} + +double OnlineForestClassifier::step() const { + return _step; +} + +OnlineForestClassifier &OnlineForestClassifier::set_step(const double step) { + _step = step; + return *this; +} + +uint32_t OnlineForestClassifier::n_samples() const { + if (_iteration > 0) { + return _iteration; + } else { + TICK_ERROR("You must call ``fit`` before asking for ``n_samples``.") + } +} + +uint32_t OnlineForestClassifier::n_features() const { + return _n_features; +} + +void OnlineForestClassifier::check_n_features(uint32_t n_features, bool predict) const { + if (n_features != _n_features) { + if (predict) { + TICK_ERROR("Wrong number of features: trained with " + std::to_string(_n_features) + + " features, but received " + std::to_string(n_features) + " features for prediction"); + } else { + TICK_ERROR("Wrong number of features: started to train with " + std::to_string(_n_features) + + " features, but received " + std::to_string(n_features) + " afterwards"); + } + } +} + +void OnlineForestClassifier::check_label(double label) const { + double iptr; + double fptr = std::modf(label, &iptr); + if (fptr != 0) { + TICK_ERROR("Wrong label type: received " + std::to_string(label) + " for a classification problem"); + } + if ((label < 0) || (label >= _n_classes)) { + TICK_ERROR("Wrong label value: received " + std::to_string(label) + " while training for classification with " + + std::to_string(_n_classes) + " classes."); + } +} + +uint8_t OnlineForestClassifier::n_classes() const { + return _n_classes; +} + +uint8_t OnlineForestClassifier::n_trees() const { + return _n_trees; +} + +int32_t OnlineForestClassifier::n_threads() const { + return _n_threads; +} + +CriterionClassifier OnlineForestClassifier::criterion() const { + return _criterion; +} + +int OnlineForestClassifier::seed() const { + return _seed; +} + +OnlineForestClassifier &OnlineForestClassifier::set_seed(int seed) { + _seed = seed; + rand.reseed(seed); + return *this; +} + +OnlineForestClassifier &OnlineForestClassifier::set_n_threads(int32_t n_threads) { + _n_threads = n_threads; + return *this; +} + +OnlineForestClassifier &OnlineForestClassifier::set_criterion(CriterionClassifier criterion) { + _criterion = criterion; + return *this; +} + +//void OnlineForestClassifier::set_feature_importances(const ArrayDouble &feature_importances) { +// _feature_importances = feature_importances; +//} + +double OnlineForestClassifier::dirichlet() const { + return _dirichlet; +} + +OnlineForestClassifier &OnlineForestClassifier::set_dirichlet(const double dirichlet) { + _dirichlet = dirichlet; + return *this; +} + +void OnlineForestClassifier::get_feature_importances(SArrayDoublePtr feature_importances) { + feature_importances->fill(0); + const double a = static_cast(1) / n_trees(); + for (TreeClassifier &tree : trees) { + feature_importances->mult_incr(tree.feature_importances(), a); + } +} diff --git a/tick/inference/src/online_forest_classifier.h b/tick/inference/src/online_forest_classifier.h new file mode 100644 index 000000000..10207ed0e --- /dev/null +++ b/tick/inference/src/online_forest_classifier.h @@ -0,0 +1,309 @@ + +#ifndef TICK_ONLINE_FOREST_CLASSIFIER_H +#define TICK_ONLINE_FOREST_CLASSIFIER_H + +// License: BSD 3 clause + +#include "base.h" +#include +#include +#include "../../random/src/rand.h" + +// TODO: change the Dirichlet parameter +// TODO: reserve nodes in advance +// TODO: set_feature_importances with a nullptr by default +// TODO: subsample parameter, default 0.5 + +// TODO: tree aggregation +// TODO: subsampling in the columns and the rows +// TODO: memory optimization (a FeatureSplitter), maximum (sizeof(uint8_t) splits)), a set of current splits +// TODO: only binary features version ? +// TODO: + +enum class CriterionClassifier { + log = 0, +}; + +class TreeClassifier; + +/********************************************************************************* + * NodeClassifier + *********************************************************************************/ + +class NodeClassifier { + protected: + // Tree containing the node + TreeClassifier &_tree; + // Index of the parent + uint32_t _parent; + // Index of the left child + uint32_t _left; + // Index of the right child + uint32_t _right; + // Index of the feature used for the split + uint32_t _feature; + // Threshold used for the split + double _threshold; + // Time of creation of the node + double _time; + // Range of the features + ArrayDouble _features_min; + ArrayDouble _features_max; + // Number of samples in the node + uint32_t _n_samples; + // The features of the sample saved in the node + // TODO: use a unique_ptr on x_t + ArrayDouble _x_t; + // The label of the sample saved in the node + double _y_t; + // Logarithm of the aggregation weight for the node + double _weight; + // Logarithm of the agregation weight for the sub-tree starting at this node + double _weight_tree; + // true if the node is a leaf + bool _is_leaf; + // Counts the number of sample seen in each class + ArrayULong _counts; + + public: + NodeClassifier(TreeClassifier &tree, uint32_t parent, double time = 0); + NodeClassifier(const NodeClassifier &node); + NodeClassifier(const NodeClassifier &&node); + NodeClassifier &operator=(const NodeClassifier &); + NodeClassifier &operator=(const NodeClassifier &&) = delete; + + // Computation of log( (e^a + e^b) / 2) in an overproof way + inline static double log_sum_2_exp(const double a, const double b) { + // TODO: if |a - b| > 50 skip + if (a > b) { + return a + std::log((1 + std::exp(b - a)) / 2); + } else { + return b + std::log((1 + std::exp(a - b)) / 2); + } + } + + // Update to apply to a node when going forward in the tree (towards leaves) + double update_downwards(const ArrayDouble &x_t, const double y_t); + // Update to apply to a node when going upward in the tree (towards the root) + void update_upwards(); + // Update the prediction of the label + void update_predict(const double y_t); + // Update range of the seen features + void update_range(const ArrayDouble &x_t); + // Predict function (average of the labels of samples that passed through the node) + void predict(ArrayDouble &scores) const; + // Loss function used for aggregation + + double score(uint8_t y) const; + + double loss(const double y_t); + + bool is_same(const ArrayDouble &x_t); + + // Get node at index in the tree + inline NodeClassifier &node(uint32_t index) const; + + // Get number of features + inline uint32_t n_features() const; + // Number of classes + inline uint8_t n_classes() const; + // Step to use for aggregation + inline double step() const; + // + inline double dirichlet() const; + // Print of the node + void print(); + + inline uint32_t parent() const; + inline NodeClassifier &set_parent(uint32_t parent); + inline uint32_t left() const; + inline NodeClassifier &set_left(uint32_t left); + inline uint32_t right() const; + inline NodeClassifier &set_right(uint32_t right); + inline bool is_leaf() const; + inline NodeClassifier &set_is_leaf(bool is_leaf); + inline uint32_t feature() const; + inline NodeClassifier &set_feature(uint32_t feature); + inline double threshold() const; + inline NodeClassifier &set_threshold(double threshold); + inline double time() const; + inline NodeClassifier &set_time(double time); + inline double features_min(const uint32_t j) const; + inline NodeClassifier &set_features_min(const ArrayDouble &features_min); + inline double features_max(const uint32_t j) const; + inline NodeClassifier &set_features_max(const ArrayDouble &features_max); + inline uint32_t n_samples() const; + inline NodeClassifier &set_n_samples(uint32_t n_samples); + inline bool use_aggregation() const; + inline double weight() const; + inline NodeClassifier &set_weight(double weight); + inline double weight_tree() const; + inline NodeClassifier &set_weight_tree(double weight); + inline const ArrayDouble &x_t() const; + inline NodeClassifier &set_x_t(const ArrayDouble &x_t); + inline double y_t() const; + inline NodeClassifier &set_y_t(const double y_t); +}; + +class OnlineForestClassifier; + +/********************************************************************************* + * TreeClassifier + *********************************************************************************/ + +class TreeClassifier { + protected: + // The forest of the tree + OnlineForestClassifier &forest; + // Number of nodes in the tree + uint32_t _n_nodes = 0; + // Iteration counter + uint32_t iteration = 0; + // Nodes of the tree + std::vector nodes = std::vector(); + // Split the node at given index + // uint32_t split_leaf(uint32_t index, const ArrayDouble &x_t, double y_t); + // Add nodes in the tree + uint32_t add_node(uint32_t parent, double time = 0); + + ArrayDouble feature_importances_; + + void extend_range(uint32_t node_index, const ArrayDouble &x_t, const double y_t); + + uint32_t go_downwards(const ArrayDouble &x_t, double y_t, bool predict); + void go_upwards(uint32_t leaf_index); + + public: + TreeClassifier(OnlineForestClassifier &forest); + TreeClassifier(const TreeClassifier &tree); + TreeClassifier(const TreeClassifier &&tree); + TreeClassifier &operator=(const TreeClassifier &) = delete; + TreeClassifier &operator=(const TreeClassifier &&) = delete; + + void fit(const ArrayDouble &x_t, double y_t); + void predict(const ArrayDouble &x_t, ArrayDouble &scores, bool use_aggregation); + + inline uint32_t n_features() const; + inline uint8_t n_classes() const; + inline uint32_t n_nodes() const; + uint32_t n_leaves() const; + inline double step() const; + inline double dirichlet() const; + + void print(); + + inline CriterionClassifier criterion() const; + inline bool use_aggregation() const; + + NodeClassifier &node(uint32_t index) { + return nodes[index]; + } + + inline ArrayDouble& feature_importances() { + return feature_importances_; + } +}; + +/********************************************************************************* + * OnlineForestClassifier + *********************************************************************************/ + +class OnlineForestClassifier { + private: + // Number of features + uint32_t _n_features; + // Number of classes in the classification problem + uint8_t _n_classes; + // Number of Trees in the forest + uint8_t _n_trees; + // Number of passes over each given data + uint8_t _n_passes; + // Step-size used for aggregation + double _step; + // CriterionClassifier used for splitting (not used for now) + CriterionClassifier _criterion; + // + bool _use_aggregation; + // + double _subsampling; + // + double _dirichlet; + // Number of threads to use for parallel growing of trees + int32_t _n_threads; + // Seed for random number generation + int _seed; + // Verbose things or not + bool _verbose; + // Iteration counter + uint32_t _iteration; + // The list of trees in the forest + std::vector trees; + // Random number generator for feature and threshold sampling + Rand rand; + + // ArrayDouble _feature_importances; + // Create trees + void create_trees(); + + SArrayDouble2dPtr _features; + SArrayDoublePtr _labels; + + void check_n_features(uint32_t n_features, bool predict) const; + inline void check_label(double label) const; + + public: + OnlineForestClassifier(uint32_t n_features, + uint8_t n_classes, + uint8_t n_trees, + uint8_t n_passes = 1, + double step = 1.0, + CriterionClassifier criterion = CriterionClassifier::log, + bool use_aggregation = true, + double subsampling = 1, + double dirichlet = 0.5, + int32_t n_threads = 1, + int seed = 0, + bool verbose = false); + virtual ~OnlineForestClassifier(); + + void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); + void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr scores); + + inline uint32_t sample_feature(); + inline uint32_t sample_feature(const ArrayDouble &prob); + // inline uint32_t sample_feature_bis(); + inline double sample_exponential(double intensity); + inline double sample_threshold(double left, double right); + + void clear(); + void print(); + + uint32_t n_samples() const; + uint32_t n_features() const; + uint8_t n_classes() const; + + uint8_t n_trees() const; + bool use_aggregation() const; + double step() const; + OnlineForestClassifier &set_step(const double step); + double dirichlet() const; + OnlineForestClassifier &set_dirichlet(const double dirichlet); + bool verbose() const; + OnlineForestClassifier &set_verbose(bool verbose); + CriterionClassifier criterion() const; + OnlineForestClassifier &set_criterion(CriterionClassifier criterion); + int32_t n_threads() const; + OnlineForestClassifier &set_n_threads(int32_t n_threads); + int seed() const; + OnlineForestClassifier &set_seed(int seed); + + void n_nodes(SArrayUIntPtr n_nodes_per_tree); + void n_leaves(SArrayUIntPtr n_leaves_per_tree); + + // void set_feature_importances(const ArrayDouble &feature_importances); + + void get_feature_importances(SArrayDoublePtr feature_importances); + +}; + +#endif //TICK_ONLINE_FOREST_CLASSIFIER_H diff --git a/tick/inference/src/online_forest_regressor.cpp b/tick/inference/src/online_forest_regressor.cpp new file mode 100644 index 000000000..253ae48eb --- /dev/null +++ b/tick/inference/src/online_forest_regressor.cpp @@ -0,0 +1,467 @@ + +// License: BSD 3 clause + +#include "online_forest_regressor.h" + +/********************************************************************************* + * NodeRegressor methods + *********************************************************************************/ + +NodeRegressor::NodeRegressor(TreeRegressor &tree, ulong parent) + : _tree(tree) { + _parent = parent; + _left = 0; + _right = 0; + _n_samples = 0; + _is_leaf = true; + _weight = 0; + _weight_tree = 0; + _predict = 0; +} + +NodeRegressor::NodeRegressor(const NodeRegressor &node) + : _tree(node._tree), + _parent(node._parent), _left(node._left), _right(node._right), + _feature(node._feature), _threshold(node._threshold), + _n_samples(node._n_samples), + _x_t(node._x_t), + _y_t(node._y_t), + _weight(node._weight), _weight_tree(node._weight_tree), + _is_leaf(node._is_leaf), + _predict(node._predict) {} + +NodeRegressor::NodeRegressor(const NodeRegressor &&node) : _tree(_tree) { + _left = node._left; + _right = node._right; + _parent = node._parent; + _feature = node._feature; + _threshold = node._threshold; + _n_samples = node._n_samples; + _weight = node._weight; + _weight_tree = node._weight_tree; + _is_leaf = node._is_leaf; + _x_t = node._x_t; + _y_t = node._y_t; +} + +void NodeRegressor::update_downwards(const ArrayDouble &x_t, const double y_t) { + _n_samples++; + _weight -= step() * loss(y_t); + update_predict(y_t); +} + +void NodeRegressor::update_upwards() { + if (_is_leaf) { + _weight_tree = _weight; + } else { + _weight_tree = log_sum_2_exp(_weight, node(_left).weight_tree() + node(_right).weight_tree()); + } +} + +void NodeRegressor::update_predict(const double y_t) { + // When a node is updated, it necessarily contains already a sample + _predict = ((_n_samples - 1) * _predict + y_t) / _n_samples; +} + +double NodeRegressor::loss(const double y_t) { + double diff = _predict - y_t; + return diff * diff / 2; +} + +inline NodeRegressor &NodeRegressor::node(ulong index) const { + return _tree.node(index); +} + +ulong NodeRegressor::n_features() const { + return _tree.n_features(); +} + +inline double NodeRegressor::step() const { + return _tree.step(); +} + +inline ulong NodeRegressor::parent() const { + return _parent; +} + +inline ulong NodeRegressor::left() const { + return _left; +} + +inline NodeRegressor &NodeRegressor::set_left(ulong left) { + _left = left; + return *this; +} + +inline ulong NodeRegressor::right() const { + return _right; +} + +inline NodeRegressor &NodeRegressor::set_right(ulong right) { + _right = right; + return *this; +} + +inline bool NodeRegressor::is_leaf() const { + return _is_leaf; +} + +inline NodeRegressor &NodeRegressor::set_is_leaf(bool is_leaf) { + _is_leaf = is_leaf; + return *this; +} + +inline ulong NodeRegressor::feature() const { + return _feature; +} + +inline NodeRegressor &NodeRegressor::set_feature(ulong feature) { + _feature = feature; + return *this; +} + +inline double NodeRegressor::threshold() const { + return _threshold; +} + +inline NodeRegressor &NodeRegressor::set_threshold(double threshold) { + _threshold = threshold; + return *this; +} + +inline ulong NodeRegressor::n_samples() const { + return _n_samples; +} + +inline NodeRegressor &NodeRegressor::set_n_samples(ulong n_samples) { + _n_samples = n_samples; + return *this; +} + +inline double NodeRegressor::weight() const { + return _weight; +} + +inline NodeRegressor &NodeRegressor::set_weight(double weight) { + _weight = weight; + return *this; +} + +inline double NodeRegressor::weight_tree() const { + return _weight_tree; +} + +inline NodeRegressor &NodeRegressor::set_weight_tree(double weight_tree) { + _weight_tree = weight_tree; + return *this; +} + +inline const ArrayDouble &NodeRegressor::x_t() const { + return _x_t; +} + +inline NodeRegressor &NodeRegressor::set_x_t(const ArrayDouble &x_t) { + _x_t = x_t; + return *this; +} + +inline double NodeRegressor::y_t() const { + return _y_t; +} + +inline NodeRegressor &NodeRegressor::set_y_t(const double y_t) { + _y_t = y_t; + return *this; +} + +inline double NodeRegressor::predict() const { + return _predict; +} + +void NodeRegressor::print() { + std::cout << "Node(parent: " << _parent + << ", left: " << _left + << ", right: " << _right + << ", n_samples: " << _n_samples + << ", is_leaf: " << _is_leaf + << ", feature: " << _feature + << ", thresh: " << _threshold + << ", predict: " << _predict + << ", weight: " << _weight + << ", weight_tree: " << _weight_tree + << ")\n"; +} + +/********************************************************************************* +* TreeRegressor methods +*********************************************************************************/ + +TreeRegressor::TreeRegressor(const TreeRegressor &tree) + : forest(tree.forest), nodes(tree.nodes) {} + +TreeRegressor::TreeRegressor(const TreeRegressor &&tree) + : forest(tree.forest), nodes(tree.nodes) {} + +TreeRegressor::TreeRegressor(OnlineForestRegressor &forest) : forest(forest) { + // TODO: pre-allocate the vector to make things faster ? + add_node(0); +} + +ulong TreeRegressor::split_leaf(ulong index, const ArrayDouble &x_t, double y_t) { + // std::cout << "Splitting node " << index << std::endl; + ulong left = add_node(index); + ulong right = add_node(index); + node(index).set_left(left).set_right(right).set_is_leaf(false); + + // TODO: better feature sampling + ulong feature = forest.sample_feature(); + + double x1_tj = x_t[feature]; + double x2_tj = node(index).x_t()[feature]; + double threshold; + + // The leaf that contains the passed sample (x_t, y_t) + ulong data_leaf; + ulong other_leaf; + + // std::cout << "x1_tj= " << x1_tj << " x2_tj= " << x2_tj << " threshold= " << threshold << std::endl; + // TODO: what if x1_tj == x2_tj. Must be taken care of by sample_feature() + if (x1_tj < x2_tj) { + threshold = forest.sample_threshold(x1_tj, x2_tj); + data_leaf = left; + other_leaf = right; + } else { + threshold = forest.sample_threshold(x2_tj, x1_tj); + data_leaf = right; + other_leaf = left; + } + // TODO: code a move_sample + NodeRegressor & current_node = node(index); + NodeRegressor & data_node = node(data_leaf); + NodeRegressor & other_node = node(other_leaf); + current_node.set_feature(feature).set_threshold(threshold); + // We pass the sample to the new leaves, and initialize the _label_average with the value + data_node.set_x_t(x_t).set_y_t(y_t); + + // other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t()); + other_node.set_x_t(current_node.x_t()).set_y_t(current_node.y_t()); + + // Update downwards of v' + other_node.update_downwards(current_node.x_t(), current_node.y_t()); + // Update upwards of v': it's a leaf + other_node.update_upwards(); + // node(other_leaf).set_weight_tree(node(other_leaf).weight()); + // Update downwards of v'' + data_node.update_downwards(x_t, y_t); + // Note: the update_up of v'' is done in the go_up method, called in fit() + return data_leaf; +} + +ulong TreeRegressor::go_downwards(const ArrayDouble &x_t, double y_t, bool predict) { + // Find the leaf that contains the sample + // Start at the root. Index of the root is always 0 + // If predict == true, this call to find_leaf is for + // prediction only, so that no leaf update and splits can be done + ulong index_current_node = 0; + bool is_leaf = false; + while (!is_leaf) { + // Get the current node + NodeRegressor ¤t_node = node(index_current_node); + if (!predict) { + current_node.update_downwards(x_t, y_t); + } + // Is the node a leaf ? + is_leaf = current_node.is_leaf(); + if (!is_leaf) { + if (x_t[current_node.feature()] <= current_node.threshold()) { + index_current_node = current_node.left(); + } else { + index_current_node = current_node.right(); + } + } + } + return index_current_node; +} + +void TreeRegressor::go_upwards(ulong leaf_index) { + ulong current = leaf_index; + while (true) { + NodeRegressor ¤t_node = node(current); + current_node.update_upwards(); + if (current == 0) { + break; + } + // We must update the root node + current = node(current).parent(); + } +} + +inline ulong TreeRegressor::n_nodes() const { + return _n_nodes; +} + +void TreeRegressor::fit(const ArrayDouble &x_t, double y_t) { + // TODO: Test that the size does not change within successive calls to fit + if (iteration == 0) { + nodes[0].set_x_t(x_t).set_y_t(y_t); + iteration++; + return; + } + ulong leaf = go_downwards(x_t, y_t, false); + ulong new_leaf = split_leaf(leaf, x_t, y_t); +// for(ulong j=0; j < n_features(); ++j) { +// double delta = std::abs(x_t[j] - node(leaf).sample().first[j]); +// if (delta > 0.) { +// new_leaf = split_node(leaf, x_t, y_t); +// break; +// } +// } + go_upwards(new_leaf); + iteration++; +} + +double TreeRegressor::predict(const ArrayDouble &x_t, bool use_aggregation) { + // std::cout << "Going downwards" << std::endl; + ulong leaf = go_downwards(x_t, 0., true); + // std::cout << "Done." << std::endl; + if (!use_aggregation) { + return node(leaf).y_t(); + } + ulong current = leaf; + // The child of the current node that does not contain the data + ulong other_index; + ulong parent; + ulong data_index; + double pred; + while (true) { + // std::cout << "node: " << current << std::endl; + NodeRegressor ¤t_node = node(current); + if (current_node.is_leaf()) { + pred = current_node.predict(); + } else { +// parent = current_node.parent(); +// if (node(parent).left() == current) { +// // other_index = node(parent).right(); +// data_index = node(parent).left(); +// +// } else { +// // other_index = node(parent).left(); +// data_index = node(parent).right(); +// } +// NodeRegressor& data_node = node(data_index); + double w = std::exp(current_node.weight() - current_node.weight_tree()); + pred = 0.5 * w * current_node.predict() + (1 - 0.5 * w) * pred; + +// weight = 0.5 * current_node.weight() * current_node.predict() +// + 0.5 * node(other).weight_tree() * weight; +// weight = 0.5 * std::exp(current_node.weight()) * current_node.labels_average() +// + 0.5 * std::exp(node(other).weight_tree() + weight); + } + // std::cout << "pred: " << pred << std::endl; + // Root must be updated as well + if (current == 0) { + break; + } + current = current_node.parent(); + } + // return weight / nodes[0].weight_tree(); + return pred; + // return weight / std::exp(nodes[0].weight_tree()); +} + +ulong TreeRegressor::add_node(ulong parent) { + nodes.emplace_back(*this, parent); + return _n_nodes++; +} + +inline ulong TreeRegressor::n_features() const { + return forest.n_features(); +} + +inline double TreeRegressor::step() const { + return forest.step(); +} + +inline CriterionRegressor TreeRegressor::criterion() const { + return forest.criterion(); +} + +/********************************************************************************* + * OnlineForestRegressor methods + *********************************************************************************/ + +OnlineForestRegressor::OnlineForestRegressor(uint32_t n_trees, + double step, + CriterionRegressor criterion, + int32_t n_threads, + int seed, + bool verbose) + : // _n_trees(n_trees), + _n_threads(n_threads), _criterion(criterion), _step(step), _verbose(verbose), trees() { + // No iteration so far + _n_trees = 10; + _iteration = 0; + create_trees(); + // Seed the random number generators + set_seed(seed); +} + +OnlineForestRegressor::~OnlineForestRegressor() {} + +void OnlineForestRegressor::create_trees() { + // Just in case... + trees.clear(); + trees.reserve(_n_trees); + for (uint32_t i = 0; i < _n_trees; ++i) { + trees.emplace_back(*this); + } +} + +void OnlineForestRegressor::fit(const SArrayDouble2dPtr features, + const SArrayDoublePtr labels) { + ulong n_samples = features->n_rows(); + ulong n_features = features->n_cols(); + set_n_features(n_features); + for (ulong i = 0; i < n_samples; ++i) { + for (TreeRegressor &tree : trees) { + // Fit the tree online using the new data point + tree.fit(view_row(*features, i), (*labels)[i]); + } + _iteration++; + } +} + +void OnlineForestRegressor::predict(const SArrayDouble2dPtr features, + SArrayDoublePtr predictions, + bool use_aggregation) { + if (_iteration > 0) { + ulong n_samples = features->n_rows(); + for (ulong i = 0; i < n_samples; ++i) { + // The prediction is simply the average of the predictions + double y_pred = 0; + for (TreeRegressor &tree : trees) { + y_pred += tree.predict(view_row(*features, i), use_aggregation); + } + (*predictions)[i] = y_pred / _n_trees; + } + } else { + TICK_ERROR("You must call ``fit`` before ``predict``.") + } +} + +inline ulong OnlineForestRegressor::sample_feature() { + return rand.uniform_int(0L, n_features() - 1); +} + +inline double OnlineForestRegressor::sample_threshold(double left, double right) { + return rand.uniform(left, right); +} + +//inline bool OnlineForestRegressor::verbose() const { +// return _verbose; +//} +// +//inline OnlineForestRegressor &OnlineForestRegressor::set_verbose(bool verbose) { +// _verbose = verbose; +// return *this; +//} diff --git a/tick/inference/src/online_forest_regressor.h b/tick/inference/src/online_forest_regressor.h new file mode 100644 index 000000000..224f02867 --- /dev/null +++ b/tick/inference/src/online_forest_regressor.h @@ -0,0 +1,281 @@ + +#ifndef TICK_ONLINEFOREST_H +#define TICK_ONLINEFOREST_H + +// License: BSD 3 clause + +#include "base.h" +#include +#include "../../random/src/rand.h" + + +enum class CriterionRegressor { + unif = 0, + mse +}; + + + +class TreeRegressor; + +/********************************************************************************* + * NodeRegressor + *********************************************************************************/ + +class NodeRegressor { + protected: + // Tree containing the node + TreeRegressor &_tree; + // Index of the parent + ulong _parent; + // Index of the left child + ulong _left; + // Index of the right child + ulong _right; + // Index of the feature used for the split + ulong _feature; + // Threshold used for the split + double _threshold; + // Number of samples in the node + ulong _n_samples; + // The features of the sample saved in the node + // TODO: use a unique_ptr on x_t + ArrayDouble _x_t; + // The label of the sample saved in the node + double _y_t; + // Logarithm of the aggregation weight for the node + double _weight; + // Logarithm of the agregation weight for the sub-tree starting at this node + double _weight_tree; + // true if the node is a leaf + bool _is_leaf; + // Average of the labels in the node (regression only for now) + double _predict = 0; + + public: + NodeRegressor(TreeRegressor &tree, ulong parent); + NodeRegressor(const NodeRegressor &node); + NodeRegressor(const NodeRegressor &&node); + NodeRegressor &operator=(const NodeRegressor &) = delete; + NodeRegressor &operator=(const NodeRegressor &&) = delete; + + // Computation of log( (e^a + e^b) / 2) in an overproof way + inline static double log_sum_2_exp(const double a, const double b) { + // TODO if |a - b| > 50 skip + if (a > b) { + return a + std::log((1 + std::exp(b - a)) / 2); + } else { + return b + std::log((1 + std::exp(a - b)) / 2); + } + } + // Update to apply to a node when going forward in the tree (towards leaves) + void update_downwards(const ArrayDouble &x_t, const double y_t); + // Update to apply to a node when going upward in the tree (towards the root) + void update_upwards(); + // Update the prediction of the label + void update_predict(const double y_t); + // Predict function (average of the labels of samples that passed through the node) + double predict() const; + // Loss function used for aggregation + double loss(const double y_t); + // Get node at index in the tree + inline NodeRegressor &node(ulong index) const; + // Get number of features + inline ulong n_features() const; + // Step to use for aggrgation + inline double step() const; + // Print of the node + void print(); + + inline ulong parent() const; + inline ulong left() const; + inline NodeRegressor &set_left(ulong left); + inline ulong right() const; + inline NodeRegressor &set_right(ulong right); + inline bool is_leaf() const; + inline NodeRegressor &set_is_leaf(bool is_leaf); + inline ulong feature() const; + inline NodeRegressor &set_feature(ulong feature); + inline double threshold() const; + inline NodeRegressor &set_threshold(double threshold); + inline ulong n_samples() const; + inline NodeRegressor &set_n_samples(ulong n_samples); + inline double weight() const; + inline NodeRegressor &set_weight(double weight); + inline double weight_tree() const; + inline NodeRegressor &set_weight_tree(double weight); + inline const ArrayDouble &x_t() const; + inline NodeRegressor &set_x_t(const ArrayDouble &x_t); + inline double y_t() const; + inline NodeRegressor &set_y_t(const double y_t); +}; + +class OnlineForestRegressor; + +/********************************************************************************* + * TreeRegressor + *********************************************************************************/ + +class TreeRegressor { + protected: + // The forest of the tree + OnlineForestRegressor &forest; + // Number of nodes in the tree + ulong _n_nodes = 0; + // Iteration counter + ulong iteration = 0; + // Nodes of the tree + std::vector nodes = std::vector(); + // Split the node at given index + ulong split_leaf(ulong index, const ArrayDouble &x_t, double y_t); + // Add nodes in the tree + ulong add_node(ulong parent); + + ulong go_downwards(const ArrayDouble &x_t, double y_t, bool predict); + void go_upwards(ulong leaf_index); + + public: + TreeRegressor(OnlineForestRegressor &forest); + TreeRegressor(const TreeRegressor &tree); + TreeRegressor(const TreeRegressor &&tree); + TreeRegressor &operator=(const TreeRegressor &) = delete; + TreeRegressor &operator=(const TreeRegressor &&) = delete; + + void fit(const ArrayDouble &x_t, double y_t); + double predict(const ArrayDouble &x_t, bool use_aggregation); + + inline ulong n_features() const; + inline ulong n_nodes() const; + inline double step() const; + + void print() { + for (NodeRegressor &node : nodes) { + node.print(); + } + } + + inline CriterionRegressor criterion() const; + + NodeRegressor &node(ulong index) { + return nodes[index]; + } +}; + +/********************************************************************************* + * OnlineForestRegressor + *********************************************************************************/ + +class OnlineForestRegressor { + private: + // Number of Trees in the forest + uint32_t _n_trees; + // Number of threads to use for parallel growing of trees + int32_t _n_threads; + // CriterionRegressor used for splitting (not used for now) + CriterionRegressor _criterion; + // Step-size used for aggregation + double _step; + // Number of features. + ulong _n_features; + // Seed for random number generation + int _seed; + // Verbose things or not + bool _verbose; + // Iteration counter + ulong _iteration; + // The list of trees in the forest + std::vector trees; + // Random number generator for feature and threshold sampling + Rand rand; + // Create trees + void create_trees(); + + public: + OnlineForestRegressor(uint32_t n_trees, double step, CriterionRegressor criterion, int32_t n_threads, + int seed, bool verbose); + virtual ~OnlineForestRegressor(); + + void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); + void predict(const SArrayDouble2dPtr features, SArrayDoublePtr predictions, bool use_aggregation); + + inline ulong sample_feature(); + inline double sample_threshold(double left, double right); + + inline double step() const { + return _step; + } + + void print() { + for (TreeRegressor &tree: trees) { + tree.print(); + } + } + + inline ulong n_samples() const { + if (_iteration > 0) { + return _iteration; + } else { + TICK_ERROR("You must call ``fit`` before asking for ``n_samples``.") + } + } + + inline ulong n_features() const { + if (_iteration > 0) { + return _n_features; + } else { + TICK_ERROR("You must call ``fit`` before asking for ``n_features``.") + } + } + + inline OnlineForestRegressor &set_n_features(ulong n_features) { + if (_iteration == 0) { + _n_features = n_features; + } else { + TICK_ERROR("OnlineForest::set_n_features can be called only once !") + } + return *this; + } + + inline uint32_t n_trees() const { + return _n_trees; + } + + inline OnlineForestRegressor &set_n_trees(uint32_t n_trees) { + _n_trees = n_trees; + return *this; + } + + inline int32_t n_threads() const { + return _n_threads; + } + + inline CriterionRegressor criterion() const { + return _criterion; + } + + inline int seed() const { + return _seed; + } + + inline OnlineForestRegressor &set_seed(int seed) { + _seed = seed; + rand.reseed(seed); + return *this; + } + + OnlineForestRegressor &set_n_threads(int32_t n_threads) { + _n_threads = n_threads; + return *this; + } + + inline OnlineForestRegressor &set_criterion(CriterionRegressor criterion) { + _criterion = criterion; + return *this; + } + + +// inline bool verbose() const; +// inline OnlineForestRegressor &set_verbose(bool verbose); +}; + +#endif //TICK_ONLINEFOREST_H diff --git a/tick/inference/swig/inference_module.i b/tick/inference/swig/inference_module.i index 964c9d72f..281c4d842 100644 --- a/tick/inference/swig/inference_module.i +++ b/tick/inference/swig/inference_module.i @@ -22,4 +22,7 @@ %include hawkes_em.i %include hawkes_adm4.i %include hawkes_basis_kernels.i -%include hawkes_sumgaussians.i \ No newline at end of file +%include hawkes_sumgaussians.i + +%include online_forest_regressor.i +%include online_forest_classifier.i diff --git a/tick/inference/swig/online_forest_classifier.i b/tick/inference/swig/online_forest_classifier.i new file mode 100644 index 000000000..02b0c085a --- /dev/null +++ b/tick/inference/swig/online_forest_classifier.i @@ -0,0 +1,57 @@ +// License: BSD 3 clause + +%include std_shared_ptr.i +%shared_ptr(OnlineForestRegressor); + +%{ +#include "online_forest_classifier.h" +%} + + +enum class CriterionClassifier { + log = 0 +}; + +class OnlineForestClassifier { + public: + + OnlineForestClassifier(uint32_t n_features, uint8_t n_classes, uint8_t n_trees, + uint8_t n_passes = 1, double step = 1.0, + CriterionClassifier criterion = CriterionClassifier::log, + bool use_aggregation = true, double subsampling=1, double dirichlet=0.5, + int32_t n_threads = 1, int seed = 0, bool verbose = false); + + void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); + void predict(const SArrayDouble2dPtr features, SArrayDouble2dPtr predictions); + void clear(); + void print(); + + uint32_t n_samples() const; + uint32_t n_features() const; + uint8_t n_classes() const; + + inline double step() const; + inline OnlineForestClassifier& set_step(const double step); + + uint32_t n_trees() const; + + int32_t n_threads() const; + OnlineForestClassifier &set_n_threads(int32_t n_threads); + + CriterionClassifier criterion() const; + OnlineForestClassifier &set_criterion(CriterionClassifier criterion); + + int seed() const; + OnlineForestClassifier &set_seed(int seed); + + void n_nodes(SArrayUIntPtr n_nodes_per_tree); + void n_leaves(SArrayUIntPtr n_leaves_per_tree); + + bool verbose() const; + OnlineForestRegressor &set_verbose(bool verbose); + + // void set_feature_importances(const ArrayDouble &feature_importances); + + void get_feature_importances(SArrayDoublePtr feature_importances); + +}; diff --git a/tick/inference/swig/online_forest_regressor.i b/tick/inference/swig/online_forest_regressor.i new file mode 100644 index 000000000..f79b65de8 --- /dev/null +++ b/tick/inference/swig/online_forest_regressor.i @@ -0,0 +1,42 @@ +// License: BSD 3 clause + +%include std_shared_ptr.i +%shared_ptr(OnlineForestRegressor); + +%{ +#include "online_forest_regressor.h" +%} + + +enum class CriterionRegressor { + unif = 0, + mse +}; + +class OnlineForestRegressor { + public: + OnlineForestRegressor(uint32_t n_trees, double step, CriterionRegressor criterion, + int32_t n_threads, int seed, bool verbose); + + void fit(const SArrayDouble2dPtr features, const SArrayDoublePtr labels); + void predict(const SArrayDouble2dPtr features, SArrayDoublePtr predictions, bool use_aggregation); + + inline double step() const; + void print(); + + ulong n_samples() const; + ulong n_features() const; + OnlineForestRegressor &set_n_features(ulong n_features); + + // uint32_t n_trees() const; + // OnlineForestRegressor &set_n_trees(uint32_t n_trees); + + int32_t n_threads() const; + OnlineForestRegressor &set_n_threads(int32_t n_threads); + CriterionRegressor criterion() const; + OnlineForestRegressor &set_criterion(CriterionRegressor criterion); + int seed() const; + OnlineForestRegressor &set_seed(int seed); + // bool verbose() const; + // OnlineForestRegressor &set_verbose(bool verbose); +}; diff --git a/tick/inference/tests/online_forest_classifier_test.py b/tick/inference/tests/online_forest_classifier_test.py new file mode 100644 index 000000000..9f36faa34 --- /dev/null +++ b/tick/inference/tests/online_forest_classifier_test.py @@ -0,0 +1,50 @@ +# License: BSD 3 clause + +import unittest +from tick.inference.tests.inference import InferenceTest + +from sklearn.datasets import make_moons, make_classification, make_circles + +from tick.inference import OnlineForestClassifier + + +# Test + +class Test(InferenceTest): + + def test_online_forest_n_features_differs(self): + n_samples = 1000 + n_classes = 2 + n_trees = 20 + + X, y = make_classification(n_samples=n_samples, n_features=10, + n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) + rng = np.random.RandomState(2) + X += 2 * rng.uniform(size=X.shape) + + of = OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, + step=1., + use_aggregation=True) + + of.fit(X, y) + + X, y = make_classification(n_samples=n_samples, n_features=10, + n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) + + of.fit(X, y) + + X, y = make_classification(n_samples=n_samples, n_features=3, + n_redundant=0, + n_informative=2, random_state=1, + n_clusters_per_class=1) + + + def test_online_forest_n_classes_differs(self): + pass + +if __name__ == "__main__": + unittest.main() diff --git a/tick/random/src/rand.cpp b/tick/random/src/rand.cpp index 91b8ebb54..9dbde0c6e 100644 --- a/tick/random/src/rand.cpp +++ b/tick/random/src/rand.cpp @@ -51,6 +51,11 @@ ulong Rand::uniform_int(ulong a, ulong b) { return uniform_ulong_dist(generator, p); } +uint32_t Rand::uniform_int(uint32_t a, uint32_t b) { + std::uniform_int_distribution::param_type p(a, b); + return uniform_uint32_dist(generator, p); +} + double Rand::uniform() { return uniform_dist(generator); } diff --git a/tick/random/src/rand.h b/tick/random/src/rand.h index 79efa5fab..6fbf0db02 100644 --- a/tick/random/src/rand.h +++ b/tick/random/src/rand.h @@ -28,6 +28,7 @@ class DLL_PUBLIC Rand { std::uniform_int_distribution uniform_int_dist; std::uniform_int_distribution uniform_ulong_dist; + std::uniform_int_distribution uniform_uint32_dist; std::uniform_real_distribution uniform_dist; std::normal_distribution normal_dist; std::exponential_distribution expon_dist; @@ -78,6 +79,13 @@ class DLL_PUBLIC Rand { */ ulong uniform_int(ulong a, ulong b); + /** + * @brief Returns a random integer between two number (both can be reached) + * \param a : lower bound + * \param b : upper bound + */ + uint32_t uniform_int(uint32_t a, uint32_t b); + /** * @brief Returns a random real between 0 and 1 */ diff --git a/video.py b/video.py new file mode 100644 index 000000000..5de85343b --- /dev/null +++ b/video.py @@ -0,0 +1,61 @@ + +import matplotlib.animation as animation +from matplotlib.animation import MovieWriter + +from sklearn.model_selection import train_test_split +import numpy as np +from tick.inference import OnlineForestClassifier +from sklearn.datasets import make_moons, make_classification, make_circles +from sklearn.metrics import roc_auc_score +import matplotlib.pyplot as plt + +n_samples = 500 +n_features = 2 +seed = 123 + +X, y = make_moons(n_samples=n_samples, noise=0.3, random_state=0) + +X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.5, random_state=42) + +h = .1 +x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 +y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) +Z = np.zeros(xx.shape) + +cm = plt.cm.RdBu + +fig = plt.figure(figsize=(5, 5)) +ax = plt.subplot(1, 1, 1) + + +ax.set_xlim(xx.min(), xx.max()) +ax.set_ylim(yy.min(), yy.max()) +ax.set_xticks(()) +ax.set_yticks(()) + +ax.scatter(X_train[:2, 0], X_train[:2, 1], c=np.array([0, 1]), s=25, cmap=cm) + +n_trees = 20 + +clf = OnlineForestClassifier(n_classes=2, n_trees=n_trees, seed=123, step=1., + use_aggregation=False) + + +def animate(i): + clf.fit(X_train[i, :].reshape(1, 2), np.array([y_train[i]])) + Z = clf.predict_proba(np.array([xx.ravel(), yy.ravel()]).T)[:, 1] + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.5) + ax.scatter(X_train[:i, 0], X_train[:i, 1], c=y_train[:i], s=25, cmap=cm) + score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) + fig.suptitle('test auc: %.2f' % score, fontsize=18) + return ax + +# Interval in seconds +interval = 10 +ani = animation.FuncAnimation(fig, animate, 200, interval=interval) + +plt.show()