diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index 0fd48a6..6853029 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -25,11 +25,14 @@ import lightgbm as lgb import pandas as pd +import numpy as np from trainer.cross_validation import cross_val_score from sklearn.model_selection import StratifiedKFold import trainer.lightgbm_functions as lf import trainer.preprocessing as pp +import trainer.plotting_functions as myplot +from sklearn.metrics import roc_auc_score # Default parameters @@ -58,7 +61,7 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, - categorical_features=None, n_splits=5, early_stopping_rounds=20): + categorical_features=None, n_splits=3, early_stopping_rounds=20): """ Returns the average score after performing cross validation on `training_data` with `n_splits` splits. At each iteration, LightDBM @@ -88,13 +91,28 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, # Run k-fold cross-validation logging.info('Running cross validation...') + scores = [] skf = StratifiedKFold(n_splits=n_splits, random_state=1) - scores = cross_val_score(gbm, training_data[predictors].values, - training_data[target].values, - scoring='roc_auc', cv=skf, n_jobs=1, verbose=1, - fit_params=fit_params) + fold = 0 - return scores.mean() + for train_index, test_index in skf.split(np.zeros(training_data.shape[0]), training_data[target]): + fold = fold + 1 + # print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index) + train = pp.preprocess_common(training_data.iloc[train_index, 0:training_data.shape[1]]) + test = pp.preprocess_common(training_data.iloc[test_index, 0:training_data.shape[1]]) + train_df = pp.preprocess_confidence(train) + test_df = pp.preprocess_confidence(train, test) + + gbm = lgb_train(lgb_params, train_df, predictors, target, + categorical_features=categorical_features, validation_data=validation_data) + + y_hat = gbm.predict(test_df[predictors].values) + + score = roc_auc_score(test_df[target].values, y_hat) + #myplot.plot_roc_curve(test[target].values, y_hat, score) + print("fold=%d, auc: %.2f%%" % (fold, score)) + scores.append(score) + return np.mean(scores) def lgb_train(params, training_data, predictors, target, @@ -136,27 +154,18 @@ def main(): level=args.log) logging.info('Preprocessing...') - - # Load the training data, i.e. "the 90%" - train_df = pp.load_train(args.train_file, int(args.number_lines) - if args.number_lines is not None else None) - train_df = pp.preprocess_confidence(train_df) - - # Load the validation data, i.e. "the 10%" + + # Load training data set, i.e. "the 90%" + train_df = pp.load_train_raw(args.train_file, 2699999) + + # Load validation data set, i.e. "the 10%" if args.valid_file is not None: - valid_df = pp.load_train(args.valid_file) - valid_df = pp.preprocess_confidence(train_df, valid_df) - else: - valid_df = None - - # Load the test data set, i.e. the data for which we need to make predictions + valid_df = pp.load_train(args.valid_file, 300002) + valid_df = pp.preprocess_confidence(valid_df) + # Load the test data set, i.e. data for which we need to make predictions if args.test_file is not None: - test_df = pp.load_test(args.test_file) - test_df = pp.preprocess_confidence(train_df, test_df) - else: - test_df = None + test_df = pp.load_test_raw(args.test_file) - # Column we're trying to predict target = 'is_attributed' # Provide default hyperparameter values @@ -200,7 +209,7 @@ def main(): # Run cross-validation logging.info('Cross-validation part...') score = lgb_cv(lgb_params, train_df, pp.predictors, target, - categorical_features=pp.categorical, n_splits=5, + categorical_features=pp.categorical, n_splits=3, validation_data=valid_df) logging.info('Average score across the folds: {}'.format(score)) diff --git a/trainer/plotting_functions.py b/trainer/plotting_functions.py new file mode 100644 index 0000000..6ebf035 --- /dev/null +++ b/trainer/plotting_functions.py @@ -0,0 +1,18 @@ +from sklearn.metrics import roc_curve +import matplotlib.pyplot as plt + + +def plot_roc_curve(y, yhat, roc_auc): + fpr, tpr, _ = roc_curve(y, yhat) + plt.figure() + lw = 2 + plt.plot(fpr, tpr, color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) + plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver operating characteristic example') + plt.legend(loc="lower right") + plt.show() diff --git a/trainer/preprocessing.py b/trainer/preprocessing.py index 0a0f37f..00f4686 100644 --- a/trainer/preprocessing.py +++ b/trainer/preprocessing.py @@ -35,15 +35,14 @@ # Columns our predictions are based on -predictors = ['app', 'device', 'os', 'channel', 'hour', 'hour_sq', - 'count_ip_day_hour', 'count_ip_hour_os', 'count_ip_hh_app', - 'count_ip_hour_device', 'ip_confRate', 'app_confRate', - 'device_confRate', 'os_confRate', 'channel_confRate', +predictors = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'hour_sq', + 'count_ip_day_hour', 'count_ip_hour_os', + 'count_ip_hh_app', 'count_ip_hour_device', 'ip_confRate', + 'app_confRate','device_confRate', 'os_confRate', 'channel_confRate', 'app_channel_confRate', 'app_os_confRate', 'app_device_confRate', - 'channel_os_confRate', 'channel_device_confRate', - 'os_device_confRate'] -categorical = ['app', 'device', 'os', 'channel', 'hour', 'hour_sq', - 'count_ip_day_hour', 'count_ip_hour_os', + 'channel_os_confRate', 'channel_device_confRate', 'os_device_confRate'] +categorical = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'hour_sq', + 'count_ip_day_hour', 'count_ip_hour_os', 'count_ip_hh_app', 'count_ip_hour_device'] @@ -55,7 +54,7 @@ def reformat_click_time(df): df.drop(['click_time'], axis=1, inplace=True) -def _preprocess_common(df): +def preprocess_common(df): """ Data transformations that should be done to both training and test data. """ @@ -143,7 +142,7 @@ def rate_calculation(x): return rate * conf -def preprocess_confidence(train_df, test_df=None): +def preprocess_confidence(train_df, test_df=None, valid_df=None): """ Feature creation that should be done given training data and then merged \ with test data. @@ -240,13 +239,12 @@ def load_train_raw(filename, number_samples): nrows=number_samples) -def load_test_raw(filename, number_samples): +def load_test_raw(filename): columns = ['ip','app','device','os', 'channel', 'click_time', 'click_id'] logging.info('Loading unlabeled data from {!r}...'.format(filename)) with open_dispatching(filename, mode='rb') as f: - return pd.read_csv(f, dtype=DTYPES, usecols=columns, - nrows=number_samples) + return pd.read_csv(f, dtype=DTYPES, usecols=columns) def load_train(filename, number_samples=None): @@ -256,14 +254,12 @@ def load_train(filename, number_samples=None): """ if number_samples < 0: number_samples = None - return _preprocess_common(load_train_raw(filename, number_samples)) + return preprocess_common(load_train_raw(filename, number_samples)) -def load_test(filename, number_samples=None): +def load_test(filename): """ Reads and preprocesses unlabeled data from `filename`. This method should be called for test data preprocessing. """ - if number_samples < 0: - number_samples = None - return _preprocess_common(load_test_raw(filename, number_samples)) + return preprocess_common(load_test_raw(filename))