From fe7d9e51fe862fc17623724b9305213d9856e3ce Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 5 Jun 2018 13:14:18 +0200 Subject: [PATCH 01/11] starting to work on shifting preprocessing to within cv --- trainer/lightgbm_main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index dbbc6e0..11fa966 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -87,10 +87,15 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, # Run k-fold cross-validation logging.info('Running cross validation...') skf = StratifiedKFold(n_splits=n_splits, random_state=1) - scores = cross_val_score(gbm, training_data[predictors].values, + for train_index, test_index in skf.split(X, y): + # print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index) + X_train, X_test = training_data[train_index], training_data[test_index] + y_train, y_test = y[train_index], y[test_index] + + gbm = lgb_train (gbm, training_data[predictors].values, training_data[target].values, - scoring='roc_auc', cv=skf, n_jobs=1, verbose=1, - fit_params=fit_params) + scoring='roc_auc') + score =eval(gbm,) return scores.mean() From 429600440faff9e0d2a55c67b39d3849052b8c8e Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 5 Jun 2018 16:02:18 +0200 Subject: [PATCH 02/11] integrate preprocessing within cv --- trainer/lightgbm_main.py | 57 +++++++++++++++++++++------------------- trainer/preprocessing.py | 2 +- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index 11fa966..1517fa0 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -25,11 +25,13 @@ import lightgbm as lgb import pandas as pd +import numpy as np from trainer.cross_validation import cross_val_score from sklearn.model_selection import StratifiedKFold import trainer.lightgbm_functions as lf import trainer.preprocessing as pp +from sklearn.metrics import roc_auc_score # Default parameters @@ -76,27 +78,34 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, 'categorical_feature': categorical_features, # 'callbacks': [lgb.print_evaluation(period=10)] } - - # If we're given some validation data, we can use it for early stopping - if validation_data is not None: - fit_params['eval_set'] = [(validation_data[predictors].values, - validation_data[target].values)] - fit_params['early_stopping_rounds'] = early_stopping_rounds - fit_params['eval_metric'] = 'auc' # Run k-fold cross-validation logging.info('Running cross validation...') + scores = [] skf = StratifiedKFold(n_splits=n_splits, random_state=1) - for train_index, test_index in skf.split(X, y): + for train_index, test_index in skf.split(np.zeros(training_data.shape[0]), training_data[target]): + fold = 1 # print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index) - X_train, X_test = training_data[train_index], training_data[test_index] - y_train, y_test = y[train_index], y[test_index] - - gbm = lgb_train (gbm, training_data[predictors].values, - training_data[target].values, - scoring='roc_auc') - score =eval(gbm,) - + train = training_data.iloc[train_index] + test = training_data.iloc[test_index] + train_df = pp.preprocess_confidence(pp.preprocess_common(train)) + test_df = pp.preprocess_confidence(pp.preprocess_common(test)) + valid_df = pp.preprocess_confidence(pp.preprocess_common(validation_data)) + + # If we're given some validation data, we can use it for early stopping + if validation_data is not None: + fit_params['eval_set'] = [(valid_df[predictors].values, + valid_df[target].values)] + fit_params['early_stopping_rounds'] = early_stopping_rounds + fit_params['eval_metric'] = 'auc' + + gbm = lgb_train(lgb_params, train_df, predictors, target, + categorical_features=categorical,validation_data=validation_data) + + y_hat = gbm.predict(test_df[predictors].values) + score = roc_auc_score(test[target].values, y_hat) + print("fold=%d, auc: %.2f%%" % (fold, score)) + scores.append(score) return scores.mean() @@ -141,21 +150,15 @@ def main(): logging.info('Preprocessing...') # Load training data set, i.e. "the 90%" - train_df = pp.load_train(args.train_file) - - valid_df = None - test_df = None + train_df = pp.load_train_raw(args.train_file) # Load validation data set, i.e. "the 10%" if args.valid_file is not None: - valid_df = pp.load_train(args.valid_file) - train_df, valid_df = pp.preprocess_confidence(train_df, valid_df) - + valid_df = pp.load_train_raw(args.valid_file) # Load the test data set, i.e. data for which we need to make predictions if args.test_file is not None: - test_df = pp.load_test(args.test_file) - train_df, test_df = pp.preprocess_confidence(train_df, test_df) - + test_df = pp.load_test_raw(args.test_file) + # Column we're trying to predict target = 'is_attributed' @@ -211,7 +214,7 @@ def main(): json.dump(lgb_params, param_file) # Make predictions and save to file - if test_df is not None: + if args.test_df is not None: logging.info('Making predictions...') predictions = gbm.predict(test_df[pp.predictors]) predictions_file = path.join(args.job_dir, 'predictions.csv') diff --git a/trainer/preprocessing.py b/trainer/preprocessing.py index 7066525..44f9862 100644 --- a/trainer/preprocessing.py +++ b/trainer/preprocessing.py @@ -44,7 +44,7 @@ 'count_ip_hh_app', 'count_ip_hour_device'] -def _preprocess_common(df): +def preprocess_common(df): """ Data transformations that should be done to both training and test data. """ From c0e717ea0d66d07505e7bb8226e6ca3112d91096 Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 5 Jun 2018 18:35:09 +0200 Subject: [PATCH 03/11] saving recent changes --- trainer/lightgbm_main.py | 7 +++---- trainer/preprocessing.py | 21 ++++++++++++++++++--- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index a31a139..cc0e55e 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -88,9 +88,8 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, # print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index) train = training_data.iloc[train_index] test = training_data.iloc[test_index] - train_df = pp.preprocess_confidence(pp.preprocess_common(train)) - test_df = pp.preprocess_confidence(pp.preprocess_common(test)) - valid_df = pp.preprocess_confidence(pp.preprocess_common(validation_data)) + train_df, test_df, valid_df = pp.preprocess_confidence(pp.preprocess_common(train), pp.preprocess_common(test), + pp.preprocess_common(validation_data)) # If we're given some validation data, we can use it for early stopping if validation_data is not None: @@ -100,7 +99,7 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, fit_params['eval_metric'] = 'auc' gbm = lgb_train(lgb_params, train_df, predictors, target, - categorical_features=categorical,validation_data=validation_data) + categorical_features=categorical_features, validation_data=validation_data) y_hat = gbm.predict(test_df[predictors].values) score = roc_auc_score(test[target].values, y_hat) diff --git a/trainer/preprocessing.py b/trainer/preprocessing.py index 0221a8b..8b5b2ce 100644 --- a/trainer/preprocessing.py +++ b/trainer/preprocessing.py @@ -136,7 +136,7 @@ def preprocess_common(df): return( df ) -def preprocess_confidence(train_df, test_df=None): +def preprocess_confidence(train_df, test_df=None, valid_df=None): """ Feature creation that should be done given training data and then merged wiht test data. """ @@ -213,8 +213,23 @@ def rate_calculation(x): on=cols, how='left' ) # replace nans by average of column - test_df = test_df.fillna(test_df.mean()) - return train_df, test_df + test_df = test_df.fillna(train_df.mean()) + + # Perform the merge of new features with validation data set + if valid_df is not None: + valid_df = valid_df.merge( + group_object['is_attributed']. \ + apply(rate_calculation). \ + reset_index(). \ + rename( + index=str, + columns={'is_attributed': new_feature} + )[cols + [new_feature]], + on=cols, how='left' + ) + # replace nans by average of column + valid_df = valid_df.fillna(train_df.mean()) + return train_df, test_df, valid_df def correlation_matrix(df): From 0cc92de6a8d6aad28518d5843075e1ad3b54eae0 Mon Sep 17 00:00:00 2001 From: sophiearana Date: Thu, 7 Jun 2018 08:45:14 +0200 Subject: [PATCH 04/11] change private function from preprocessing script to be accessible outside --- trainer/lightgbm_main.py | 7 +++++-- trainer/preprocessing.py | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index cc0e55e..7639401 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -99,13 +99,13 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, fit_params['eval_metric'] = 'auc' gbm = lgb_train(lgb_params, train_df, predictors, target, - categorical_features=categorical_features, validation_data=validation_data) + categorical_features=categorical_features, validation_data=valid_df) y_hat = gbm.predict(test_df[predictors].values) score = roc_auc_score(test[target].values, y_hat) print("fold=%d, auc: %.2f%%" % (fold, score)) scores.append(score) - return scores.mean() + return np.mean(scores) def lgb_train(params, training_data, predictors, target, @@ -190,6 +190,8 @@ def main(): # Train the final model on all data logging.info('Training on all data...') + train_df, _, valid_df = pp.preprocess_confidence(pp.preprocess_common(train_df), None, + pp.preprocess_common(valid_df)) gbm = lgb_train(lgb_params, train_df, pp.predictors, target, categorical_features=pp.categorical, validation_data=valid_df) @@ -213,6 +215,7 @@ def main(): # Make predictions and save to file if args.test_df is not None: + _, test_df, _ = pp.preprocess_confidence(pp.preprocess_common(train_df), pp.preprocess_common(test_df)) logging.info('Making predictions...') predictions = gbm.predict(test_df[pp.predictors]) predictions_file = path.join(args.job_dir, 'predictions.csv') diff --git a/trainer/preprocessing.py b/trainer/preprocessing.py index 8b5b2ce..a37d1e1 100644 --- a/trainer/preprocessing.py +++ b/trainer/preprocessing.py @@ -33,13 +33,13 @@ # Columns our predictions are based on -predictors = ['app', 'device', 'os', 'channel', 'hour', 'hour_sq', +predictors = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'hour_sq', 'count_ip_day_freq_h', 'count_ip_day_hour', 'count_ip_hour_os', 'count_ip_hh_app', 'count_ip_hour_device', 'ip_confRate', 'app_confRate','device_confRate', 'os_confRate', 'channel_confRate', 'app_channel_confRate', 'app_os_confRate', 'app_device_confRate', 'channel_os_confRate', 'channel_device_confRate', 'os_device_confRate'] -categorical = ['app', 'device', 'os', 'channel', 'hour', 'hour_sq', +categorical = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'hour_sq', 'count_ip_day_freq_h', 'count_ip_day_hour', 'count_ip_hour_os', 'count_ip_hh_app', 'count_ip_hour_device'] @@ -259,7 +259,7 @@ def load_train(filename): Reads and preprocesses labeled data from `filename`. This method should be called for both training and validation data. """ - return _preprocess_common(load_train_raw(filename)) + return preprocess_common(load_train_raw(filename)) def load_test(filename): @@ -267,5 +267,5 @@ def load_test(filename): Reads and preprocesses unlabeled data from `filename`. This method should be called for test data preprocessing. """ - return _preprocess_common(load_test_raw(filename)) + return preprocess_common(load_test_raw(filename)) From 19c713ca4c2fcbb068a223f0600d2828da6ce66f Mon Sep 17 00:00:00 2001 From: sophiearana Date: Thu, 7 Jun 2018 11:29:00 +0200 Subject: [PATCH 05/11] add plotting of roc curve --- trainer/lightgbm_main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index 7639401..ec45e0a 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -31,6 +31,7 @@ from sklearn.model_selection import StratifiedKFold import trainer.lightgbm_functions as lf import trainer.preprocessing as pp +import trainer.plotting_functions as myplot from sklearn.metrics import roc_auc_score @@ -102,7 +103,9 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, categorical_features=categorical_features, validation_data=valid_df) y_hat = gbm.predict(test_df[predictors].values) + score = roc_auc_score(test[target].values, y_hat) + myplot.plot_roc_curve(test[target].values, y_hat, score) print("fold=%d, auc: %.2f%%" % (fold, score)) scores.append(score) return np.mean(scores) From 2239624d8526685bf307f9091d50ee2fba269fba Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 12 Jun 2018 08:31:07 +0200 Subject: [PATCH 06/11] adjust for changes in preprocessing --- trainer/lightgbm_main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index 4797906..5f75896 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -92,8 +92,11 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, # print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index) train = training_data.iloc[train_index] test = training_data.iloc[test_index] - train_df, test_df, valid_df = pp.preprocess_confidence(pp.preprocess_common(train), pp.preprocess_common(test), - pp.preprocess_common(validation_data)) + train_df = pp.preprocess_confidence(pp.preprocess_common(train)) + test_df, valid_df = pp.preprocess_confidence(pp.preprocess_common(train), pp.preprocess_common(test), + pp.preprocess_common(validation_data)) + test_df, valid_df = pp.preprocess_confidence(pp.preprocess_common(train), pp.preprocess_common(test), + pp.preprocess_common(validation_data)) # If we're given some validation data, we can use it for early stopping if validation_data is not None: From f8d48b675ceb0f2b97260a20164759faaaa1c464 Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 12 Jun 2018 11:55:33 +0200 Subject: [PATCH 07/11] move validation data outside cv loop, minor fixes, hardcoded lines corresponding to small90 and small10 --- trainer/lightgbm_main.py | 42 +++++++++++++++++++--------------------- trainer/preprocessing.py | 8 ++++---- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index 5f75896..7e144ec 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -61,7 +61,7 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, - categorical_features=None, n_splits=5, early_stopping_rounds=20): + categorical_features=None, n_splits=3, early_stopping_rounds=20): """ Returns the average score after performing cross validation on `training_data` with `n_splits` splits. At each iteration, LightDBM @@ -82,36 +82,34 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, # 'callbacks': [lgb.print_evaluation(period=10)] } + # If we're given some validation data, we can use it for early stopping + if validation_data is not None: + fit_params['eval_set'] = [(validation_data[predictors].values, + validation_data[target].values)] + fit_params['early_stopping_rounds'] = early_stopping_rounds + fit_params['eval_metric'] = 'auc' + # Run k-fold cross-validation logging.info('Running cross validation...') scores = [] skf = StratifiedKFold(n_splits=n_splits, random_state=1) - + fold = 0 + for train_index, test_index in skf.split(np.zeros(training_data.shape[0]), training_data[target]): - fold = 1 + fold = fold + 1 # print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index) - train = training_data.iloc[train_index] - test = training_data.iloc[test_index] - train_df = pp.preprocess_confidence(pp.preprocess_common(train)) - test_df, valid_df = pp.preprocess_confidence(pp.preprocess_common(train), pp.preprocess_common(test), - pp.preprocess_common(validation_data)) - test_df, valid_df = pp.preprocess_confidence(pp.preprocess_common(train), pp.preprocess_common(test), - pp.preprocess_common(validation_data)) - - # If we're given some validation data, we can use it for early stopping - if validation_data is not None: - fit_params['eval_set'] = [(valid_df[predictors].values, - valid_df[target].values)] - fit_params['early_stopping_rounds'] = early_stopping_rounds - fit_params['eval_metric'] = 'auc' + train = pp.preprocess_common(training_data.iloc[train_index]) + test = pp.preprocess_common(training_data.iloc[test_index]) + train_df = pp.preprocess_confidence(train) + test_df = pp.preprocess_confidence(train, test) gbm = lgb_train(lgb_params, train_df, predictors, target, - categorical_features=categorical_features, validation_data=valid_df) + categorical_features=categorical_features, validation_data=validation_data) y_hat = gbm.predict(test_df[predictors].values) - score = roc_auc_score(test[target].values, y_hat) - myplot.plot_roc_curve(test[target].values, y_hat, score) + score = roc_auc_score(test_df[target].values, y_hat) + #myplot.plot_roc_curve(test[target].values, y_hat, score) print("fold=%d, auc: %.2f%%" % (fold, score)) scores.append(score) return np.mean(scores) @@ -158,11 +156,11 @@ def main(): logging.info('Preprocessing...') # Load training data set, i.e. "the 90%" - train_df = pp.load_train_raw(args.train_file) + train_df = pp.load_train_raw(args.train_file, 2699999) # Load validation data set, i.e. "the 10%" if args.valid_file is not None: - valid_df = pp.load_train_raw(args.valid_file) + valid_df = pp.preprocess_confidence(pp.load_train(args.valid_file, 300002)) # Load the test data set, i.e. data for which we need to make predictions if args.test_file is not None: test_df = pp.load_test_raw(args.test_file) diff --git a/trainer/preprocessing.py b/trainer/preprocessing.py index 0bbeb9b..867b720 100644 --- a/trainer/preprocessing.py +++ b/trainer/preprocessing.py @@ -36,13 +36,13 @@ # Columns our predictions are based on predictors = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'hour_sq', - 'count_ip_day_freq_h', 'count_ip_day_hour', 'count_ip_hour_os', + 'count_ip_day_hour', 'count_ip_hour_os', 'count_ip_hh_app', 'count_ip_hour_device', 'ip_confRate', 'app_confRate','device_confRate', 'os_confRate', 'channel_confRate', 'app_channel_confRate', 'app_os_confRate', 'app_device_confRate', 'channel_os_confRate', 'channel_device_confRate', 'os_device_confRate'] categorical = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'hour_sq', - 'count_ip_day_freq_h', 'count_ip_day_hour', 'count_ip_hour_os', + 'count_ip_day_hour', 'count_ip_hour_os', 'count_ip_hh_app', 'count_ip_hour_device'] @@ -255,7 +255,7 @@ def load_train(filename, number_samples=None): """ if number_samples < 0: number_samples = None - return _preprocess_common(load_train_raw(filename, number_samples)) + return preprocess_common(load_train_raw(filename, number_samples)) def load_test(filename, number_samples=None): @@ -265,4 +265,4 @@ def load_test(filename, number_samples=None): """ if number_samples < 0: number_samples = None - return _preprocess_common(load_test_raw(filename, number_samples)) + return preprocess_common(load_test_raw(filename, number_samples)) From 26b45524143211fc278a812b7f232e52954d5497 Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 12 Jun 2018 14:20:46 +0200 Subject: [PATCH 08/11] for plotting roc_auc if interested --- trainer/plotting_functions.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 trainer/plotting_functions.py diff --git a/trainer/plotting_functions.py b/trainer/plotting_functions.py new file mode 100644 index 0000000..6ebf035 --- /dev/null +++ b/trainer/plotting_functions.py @@ -0,0 +1,18 @@ +from sklearn.metrics import roc_curve +import matplotlib.pyplot as plt + + +def plot_roc_curve(y, yhat, roc_auc): + fpr, tpr, _ = roc_curve(y, yhat) + plt.figure() + lw = 2 + plt.plot(fpr, tpr, color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) + plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver operating characteristic example') + plt.legend(loc="lower right") + plt.show() From 7d3d814724578613874e8b97efcdcaad26d287fd Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 12 Jun 2018 15:25:07 +0200 Subject: [PATCH 09/11] always take all lines in test data and correct indexing --- trainer/lightgbm_main.py | 6 +++--- trainer/preprocessing.py | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index 7e144ec..88b77b7 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -98,8 +98,8 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None, for train_index, test_index in skf.split(np.zeros(training_data.shape[0]), training_data[target]): fold = fold + 1 # print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index) - train = pp.preprocess_common(training_data.iloc[train_index]) - test = pp.preprocess_common(training_data.iloc[test_index]) + train = pp.preprocess_common(training_data.iloc[train_index, 0:training_data.shape[1]]) + test = pp.preprocess_common(training_data.iloc[test_index, 0:training_data.shape[1]]) train_df = pp.preprocess_confidence(train) test_df = pp.preprocess_confidence(train, test) @@ -190,7 +190,7 @@ def main(): # Run cross-validation logging.info('Cross-validation part...') score = lgb_cv(lgb_params, train_df, pp.predictors, target, - categorical_features=pp.categorical, n_splits=5, + categorical_features=pp.categorical, n_splits=3, validation_data=valid_df) logging.info('Average score across the folds: {}'.format(score)) diff --git a/trainer/preprocessing.py b/trainer/preprocessing.py index 867b720..59f805d 100644 --- a/trainer/preprocessing.py +++ b/trainer/preprocessing.py @@ -258,11 +258,9 @@ def load_train(filename, number_samples=None): return preprocess_common(load_train_raw(filename, number_samples)) -def load_test(filename, number_samples=None): +def load_test(filename): """ Reads and preprocesses unlabeled data from `filename`. This method should be called for test data preprocessing. """ - if number_samples < 0: - number_samples = None - return preprocess_common(load_test_raw(filename, number_samples)) + return preprocess_common(load_test_raw(filename)) From 350b8d6dfdcf4ca6d1ed10c33466e54a3bc6bbc5 Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 12 Jun 2018 15:27:28 +0200 Subject: [PATCH 10/11] delete number_samples parameter from load_test_raw --- trainer/preprocessing.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/trainer/preprocessing.py b/trainer/preprocessing.py index 59f805d..00f4686 100644 --- a/trainer/preprocessing.py +++ b/trainer/preprocessing.py @@ -239,13 +239,12 @@ def load_train_raw(filename, number_samples): nrows=number_samples) -def load_test_raw(filename, number_samples): +def load_test_raw(filename): columns = ['ip','app','device','os', 'channel', 'click_time', 'click_id'] logging.info('Loading unlabeled data from {!r}...'.format(filename)) with open_dispatching(filename, mode='rb') as f: - return pd.read_csv(f, dtype=DTYPES, usecols=columns, - nrows=number_samples) + return pd.read_csv(f, dtype=DTYPES, usecols=columns) def load_train(filename, number_samples=None): From 9d71b51fade5ed65c5f041c3a8141d789a47f752 Mon Sep 17 00:00:00 2001 From: sophiearana Date: Tue, 12 Jun 2018 15:55:00 +0200 Subject: [PATCH 11/11] separate two types of preprocessing --- trainer/lightgbm_main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/trainer/lightgbm_main.py b/trainer/lightgbm_main.py index 88b77b7..16601d5 100644 --- a/trainer/lightgbm_main.py +++ b/trainer/lightgbm_main.py @@ -160,7 +160,8 @@ def main(): # Load validation data set, i.e. "the 10%" if args.valid_file is not None: - valid_df = pp.preprocess_confidence(pp.load_train(args.valid_file, 300002)) + valid_df = pp.load_train(args.valid_file, 300002) + valid_df = pp.preprocess_confidence(valid_df) # Load the test data set, i.e. data for which we need to make predictions if args.test_file is not None: test_df = pp.load_test_raw(args.test_file)