|
| 1 | + |
| 2 | +import pandas as pd |
| 3 | +import numpy as np |
| 4 | +import cv2 |
| 5 | +import lightgbm as lgbm |
| 6 | +import os |
| 7 | +os.environ['CUDA_VISIBLE_DEVICES']='5' |
| 8 | +from keras.applications import * |
| 9 | +from imagecleanup2 import cleanupImage as cleanupImage2 |
| 10 | +#import lightgbm as lgbm |
| 11 | + |
# Pretrained ImageNet networks used as fixed feature extractors.
#
# pooling='avg' makes the headless ResNet50 return one 2048-vector per image.
# Keras releases whose headless ResNet50 ends at the last convolutional block
# would otherwise return a 7x7x2048 map, and the `.reshape(N, 2048)` applied
# to its predictions further down would fail. On releases that already pool
# before the (removed) top, averaging a 1x1 map leaves the values unchanged.
r50 = ResNet50(weights='imagenet', include_top=False, pooling='avg',
               input_shape=(224, 224, 3))

# Other backbones that were tried; left disabled.
#xc299 = Xception(include_top=True, weights='imagenet', input_shape=(299,299,3))
#xc299nf = Xception(include_top=False, weights='imagenet', input_shape=(299,299,3))
#incv3 = InceptionV3(include_top=True, weights='imagenet', input_shape=(299,299,3))
#vgg16nf = VGG16(weights='imagenet',include_top=True,input_shape=(224,224,3))
#vgg19nf = VGG19(weights='imagenet',include_top=True,input_shape=(224,224,3))

# Both VGG nets keep their classification head (include_top=True), so their
# predictions are the networks' final-layer outputs.
vgg16 = VGG16(weights='imagenet', include_top=True, input_shape=(224, 224, 3))
vgg19 = VGG19(weights='imagenet', include_top=True, input_shape=(224, 224, 3))
| 20 | + |
# Training catalogue (semicolon-separated). Masses are stored as log10 values;
# add the linear mass and its first-order propagated error:
#   d(10**x) = 10**x * ln(10) * dx
df = pd.read_csv('Train.csv', sep=';')
df['lin_mass'] = np.power(10, df.logMstar)
df['lin_err'] = df.lin_mass * np.log(10) * df.err_logMstar

# Keep only rows with a usable target: logMstar == -99 is presumably a
# missing-value sentinel (confirm against the data description), and a zero
# error would later be used as a divisor.
usable = (df.logMstar != -99) & (df.err_logMstar != 0)
df = df[usable]
np.random.seed(0)

# N rows in total; the last 4000 are used as the hold-out split further down.
ids = df.SDSS_ID.values
N = len(ids)
M = N - 4000
print(len(ids))

# Targets and uncertainties in log space and in linear space.
Y = df.logMstar.values
err = df.err_logMstar.values
Y_lin = df.lin_mass.values
err_lin = df.lin_err.values

# Per-object g-band image paths; only their count is reported here.
gids = ['Train/' + str(sdss_id) + '-g.csv' for sdss_id in ids]
print(len(gids))
| 40 | + |
# Load the precomputed image stack and report its shape and value range.
print('loading Xg')
Xg = np.load('Ximg.npy')

print(Xg.shape)
print(Xg.min(), Xg.max())

# The CNNs expect three channels: broadcast the single-channel stack into a
# float64 (N, 224, 224, 3) buffer so every channel holds the same image.
print('reshaping')
Xg3 = np.empty((N, 224, 224, 3))
Xg3[...] = Xg.reshape(N, 224, 224, 1)
print(Xg3.min(), Xg3.max())

# Run each pretrained network over the whole stack.
print('r50')
Xg3r50 = r50.predict(Xg3).reshape(N, 2048)
print('vgg16')
Xg3vgg16 = vgg16.predict(Xg3)
print('vgg19')
Xg3vgg19 = vgg19.predict(Xg3)
print('done')
| 59 | + |
print('Features X g band 3 ch features')

Distance = df.Distance.values[:N].reshape(N, 1)

# Half-width of the square window around pixel (112, 112) used for the
# "mean center" feature.
csize = 2

flat_pixels = Xg3.reshape(N, -1)
log_dist = np.log(Distance)
center_patch = Xg3[:, 112 - csize:112 + csize, 112 - csize:112 + csize, 0]

# Hand-made distance features: powers, logs and their reciprocals.
distance_cols = [
    Distance, 1 / Distance,
    Distance ** 2, 1 / (Distance ** 2),
    Distance ** 3, 1 / (Distance ** 3),
    log_dist, 1 / log_dist,
    np.log(Distance ** 2), 1 / np.log(Distance ** 2),
    log_dist ** 2, 1 / log_dist ** 2,
]

# Whole-image pixel statistics, one column each: sum, min, max, mean, std.
pixel_cols = [
    stat(flat_pixels, axis=1).reshape(N, 1)
    for stat in (np.sum, np.min, np.max, np.mean, np.std)
]

# Column order: ResNet50 features, VGG16 outputs, distance features, pixel
# statistics, center pixel, mean of the center window.
# (VGG19 outputs are deliberately left out.)
Xg3f = np.hstack(
    [Xg3r50, Xg3vgg16]
    + distance_cols
    + pixel_cols
    + [
        Xg3[:, 112, 112, 0].reshape(N, 1),  # center
        np.mean(center_patch.reshape(N, -1), axis=1).reshape(N, -1),  # mean center
    ]
)

# These are views onto Xg3; drop them so they do not keep the large image
# buffer alive once Xg3 itself is released or rebound.
del flat_pixels, center_patch


print(Xg3r50.shape)
print(Xg3vgg16.shape)
print(Xg3f.shape)

np.save('Xg3f', Xg3f)
| 97 | + |
# First M rows train the model; the remaining rows are the hold-out split.
dtrain = lgbm.Dataset(Xg3f[:M], label=Y[:M])
dtest = lgbm.Dataset(Xg3f[M:], label=Y[M:])


# Booster configuration. Sampling, regularisation and GPU options were
# experimented with and are currently disabled, so library defaults apply.
lgbm_params = dict(
    boosting_type='gbdt',
    objective='regression_l2',
    nthread=35,
    silent=True,
    num_leaves=2 ** 4,
    learning_rate=0.05,
    max_depth=10,
    max_bin=255,
    metric='rmse',
    verbose=0,
)

# 3-fold shuffled CV on the training split to pick the number of rounds;
# stops once the CV metric has not improved for 200 rounds.
bst = lgbm.cv(
    lgbm_params,
    dtrain,
    num_boost_round=10000,
    nfold=3,
    stratified=False,
    shuffle=True,
    early_stopping_rounds=200,
    verbose_eval=10,
    show_stdv=True,
    seed=0,
)

# NOTE(review): this is the CV history length minus one - confirm that
# dropping one round relative to the recorded history is intended.
num_boost_round = len(bst['rmse-mean']) - 1
print(num_boost_round)


# Refit on the whole training split. The hold-out split is monitored as
# 'test', with an early-stopping patience equal to the full round budget.
# NOTE(review): monitoring the hold-out split here means it is not a fully
# independent test of the final model - confirm this is acceptable.
model = lgbm.train(
    lgbm_params,
    dtrain,
    num_boost_round,
    valid_sets=[dtest],
    valid_names=['test'],
    early_stopping_rounds=num_boost_round,
    verbose_eval=10,
)

pred = model.predict(Xg3f[M:])
| 144 | + |
def xi2(true, pred, error):
    """Return the mean squared residual scaled by the squared error.

    Computes ``mean((true - pred) ** 2 / error ** 2)`` over all elements.

    Args:
        true: Reference values (scalar, sequence or array).
        pred: Predicted values, broadcastable against ``true``.
        error: Uncertainty of each reference value, broadcastable against
            ``true``. Zero entries are not guarded against and follow
            NumPy's division semantics.

    Returns:
        The statistic as a NumPy float.
    """
    # Coerce up front so plain lists and scalars work, not only ndarrays.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    error = np.asarray(error, dtype=float)
    residual = true - pred
    return np.mean(residual ** 2 / error ** 2)
| 148 | + |
# Score the hold-out predictions in log space, then again in linear space
# using the propagated linear errors.
holdout_true = Y[M:]
print('xi2', xi2(holdout_true, pred, err[M:]))

xi2lin = xi2(np.power(10, holdout_true), np.power(10, pred), err_lin[M:])
print('xi2lin', xi2lin)

# The linear-space score is embedded in the saved model's file name.
model_path = 'lgbm' + str(xi2lin)
model.save_model(model_path, num_iteration=-1)
| 154 | + |
### TEST

# Test catalogue: one row per object with its SDSS id and distance.
df_test = pd.read_csv('Test_Distance.csv', sep=';')
#df_valid = pd.read_csv('validationdata_SDSSID.csv',sep=';')
#ids = df_test[df_test.SDSS_ID.isin(df_valid.SDSS_ID)]['SDSS_ID']
ids = df_test.SDSS_ID

gids = ['Test/' + str(sdss_id) + '-g.csv' for sdss_id in ids]

# Read each g-band image, clean it, standardise it, crop the central half in
# both dimensions and resize the crop to 224x224.
Xg_ = []
for i, path in enumerate(gids):
    Xg = np.genfromtxt(path, delimiter=",")
    Xg = cleanupImage2(Xg)

    # Zero mean, then unit standard deviation (std taken after centring).
    Xg = Xg - np.mean(Xg)
    Xg = Xg / np.std(Xg)

    h, w = Xg.shape
    cy, cx = h // 2, w // 2
    dy, dx = cy // 2, cx // 2
    Xg = Xg[cy - dy:cy + dy, cx - dx:cx + dx]

    # NOTE(review): this call used to pass cv2.INTER_AREA as the third
    # positional argument, which is `dst` in cv2.resize, so that flag never
    # selected the interpolation mode and the default (bilinear) was used.
    # The effective mode is now spelled out to keep the pixel values the
    # same. Switching to INTER_AREA should be done together with whatever
    # produced 'Ximg.npy' so train and test images stay consistent.
    Xgr = cv2.resize(Xg, (224, 224), interpolation=cv2.INTER_LINEAR)
    Xg_.append(Xgr)

    # Lightweight progress indicator.
    if i % 10 == 0:
        print(i, end=' ', flush=True)
| 178 | + |
# From here on N is the number of test objects.
N = len(ids)
Xg = np.stack(Xg_)

# Same single-channel -> three-channel broadcast as for the training stack.
Xg3 = np.empty((N, 224, 224, 3))
Xg3[...] = Xg.reshape(N, 224, 224, 1)

print('r50')
Xg3r50 = r50.predict(Xg3).reshape(N, 2048)
print('vgg16')
Xg3vgg16 = vgg16.predict(Xg3)
#Xg3vgg19 = vgg19.predict(Xg3)

#Distance = df_test[df_test.SDSS_ID.isin(ids)].Distance.values.reshape(N,1)
Distance = df_test.Distance.values.reshape(N, 1)
| 190 | + |
# Half-width of the square window around pixel (112, 112) used for the
# "mean center" feature.
csize = 2

flat_pixels = Xg3.reshape(N, -1)
log_dist = np.log(Distance)
center_patch = Xg3[:, 112 - csize:112 + csize, 112 - csize:112 + csize, 0]

# Hand-made distance features: powers, logs and their reciprocals.
distance_cols = [
    Distance, 1 / Distance,
    Distance ** 2, 1 / (Distance ** 2),
    Distance ** 3, 1 / (Distance ** 3),
    log_dist, 1 / log_dist,
    np.log(Distance ** 2), 1 / np.log(Distance ** 2),
    log_dist ** 2, 1 / log_dist ** 2,
]

# Whole-image pixel statistics, one column each: sum, min, max, mean, std.
pixel_cols = [
    stat(flat_pixels, axis=1).reshape(N, 1)
    for stat in (np.sum, np.min, np.max, np.mean, np.std)
]

# Column order must match the matrix the model was trained on: ResNet50
# features, VGG16 outputs, distance features, pixel statistics, center pixel,
# mean of the center window. (VGG19 outputs are deliberately left out.)
Xg3f = np.hstack(
    [Xg3r50, Xg3vgg16]
    + distance_cols
    + pixel_cols
    + [
        Xg3[:, 112, 112, 0].reshape(N, 1),  # center
        np.mean(center_patch.reshape(N, -1), axis=1).reshape(N, -1),  # mean center
    ]
)

# These are views onto Xg3; drop them so that deleting Xg3 afterwards really
# releases the image buffer.
del flat_pixels, center_patch
| 218 | + |
| 219 | + |
# The CNN outputs and the image buffer are no longer needed once the feature
# matrix exists; release them before predicting.
del Xg3r50
del Xg3vgg16
del Xg3

# Predict straight from the feature matrix. (A LightGBM Dataset wrapper was
# previously built here but never used, so it has been removed.)
pred = model.predict(Xg3f)

# Submission file: one row per test object; the hold-out linear-space score
# is embedded in the file name.
df_sub = pd.DataFrame({'pssid': ids, 'mass': pred}, columns=['pssid', 'mass'])
df_sub.to_csv('submission_gold_' + str(xi2lin) + '.csv', index=False)
| 228 | + |
0 commit comments