Skip to content

Commit 0529dba

Browse files
committed
added full test
1 parent 1a402b2 commit 0529dba

File tree

6 files changed

+73061
-0
lines changed

6 files changed

+73061
-0
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,13 @@
11
# astrohack_solution
22
Solution to https://astrohack.org/
3+
4+
The `validation` folder contains the code and data for the validation phase.
5+
6+
The `full_test` folder contains the code and data for the full test phase.
7+
8+
The difference between the two is:
9+
10+
- fixed a bug that caused galaxies with zero error to be filtered out on only a subset of the data
11+
- the training set uses all available samples (N=full) instead of N=10K
12+
- the submission is generated on the full test set instead of just validation
13+

full_test/1_mthread_proc_full.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import pandas as pd
2+
import numpy as np
3+
from imagecleanup2 import cleanupImage as cleanupImage2
4+
import cv2
5+
6+
# Load the training table (semicolon-separated) and derive linear-scale
# columns from the log-scale mass and its error.
df = pd.read_csv('Train.csv',sep=';')
7+
df['lin_mass'] = np.power(10, df.logMstar)
8+
# Linear error computed as lin_mass * ln(10) * err_logMstar.
df['lin_err'] = df.lin_mass * np.log(10) * df.err_logMstar
9+
10+
# Drop rows with logMstar == -99 (presumably a missing-value sentinel -- confirm)
# and rows whose reported error is exactly 0.
df = df[df.logMstar!=-99]
11+
df = df[df.err_logMstar!=0]
12+
13+
# N is the number of rows left after filtering, so every [:N] slice below
# keeps all remaining rows.
N=len(df.SDSS_ID.values)
14+
15+
ids = df.SDSS_ID.values[:N]
16+
17+
18+
print(len(ids))
19+
20+
21+
# NOTE(review): Y, err, Y_lin and err_lin are not referenced again in the
# visible part of this script.
Y = df.logMstar.values[:N]
22+
err = df.err_logMstar.values[:N]
23+
Y_lin = df.lin_mass.values[:N]
24+
err_lin = df.lin_err.values[:N]
25+
26+
# One 'Train/<SDSS_ID>-g.csv' path per remaining row, in table order.
gids = ['Train/'+str(id)+'-g.csv' for id in ids]
27+
28+
def img_preproc(id):
    """Turn one image CSV into a standardised, centre-cropped 224x224 array.

    Args:
        id: Path of the comma-separated image file to read.

    Returns:
        The array returned by ``cv2.resize`` for the cropped image.
    """
    image = cleanupImage2(np.genfromtxt(id, delimiter=","))

    # Standardise in place: zero mean, unit standard deviation.
    image -= np.mean(image)
    image /= np.std(image)

    # Keep the central window: half the height and half the width,
    # centred on the image midpoint (all integer divisions).
    rows, cols = image.shape
    mid_row, mid_col = rows // 2, cols // 2
    half_h, half_w = mid_row // 2, mid_col // 2
    window = image[mid_row - half_h:mid_row + half_h,
                   mid_col - half_w:mid_col + half_w]

    # NOTE(review): cv2.resize's third positional parameter is `dst`, not
    # `interpolation`, so INTER_AREA is probably not applied here -- confirm.
    # Left as-is because the test-time preprocessing uses the identical call
    # and both must stay in sync.
    resized = cv2.resize(window, (224, 224), cv2.INTER_AREA)

    # Progress tick, one dot per processed image.
    print('.', end='', flush=True)
    return resized
40+
41+
# Preprocess every training image with joblib, using 40 parallel jobs.
from joblib import Parallel, delayed
42+
X_ = Parallel(n_jobs=40)(delayed(img_preproc)(i) for i in gids)
43+
44+
# Stack the per-image arrays along a new leading axis (one entry per path in gids).
X = np.stack(X_)
45+
46+
# Saved under the name 'Ximg'; the training script loads it back as 'Ximg.npy'.
np.save('Ximg',X)
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
2+
import pandas as pd
3+
import numpy as np
4+
import cv2
5+
import lightgbm as lgbm
6+
import os
7+
# Set CUDA_VISIBLE_DEVICES to '5' before keras is imported below
# (presumably to pin the run to a single GPU -- machine-specific, confirm).
os.environ['CUDA_VISIBLE_DEVICES']='5'
8+
from keras.applications import *
9+
from imagecleanup2 import cleanupImage as cleanupImage2
10+
#import lightgbm as lgbm
11+
12+
# ImageNet-pretrained networks used only through predict() further down:
# ResNet50 without its top (include_top=False), VGG16/VGG19 with their top.
r50 = ResNet50(weights='imagenet',include_top=False,input_shape=(224,224,3))
13+
#xc299 = Xception(include_top=True, weights='imagenet', input_shape=(299,299,3))
14+
#xc299nf = Xception(include_top=False, weights='imagenet', input_shape=(299,299,3))
15+
#incv3 = InceptionV3(include_top=True, weights='imagenet', input_shape=(299,299,3))
16+
#vgg16nf = VGG16(weights='imagenet',include_top=True,input_shape=(224,224,3))
17+
vgg16 = VGG16(weights='imagenet',include_top=True,input_shape=(224,224,3))
18+
#vgg19nf = VGG19(weights='imagenet',include_top=True,input_shape=(224,224,3))
19+
# NOTE(review): vgg19 outputs are computed for the training images later in this
# script but are commented out of both feature stacks.
vgg19 = VGG19(weights='imagenet',include_top=True,input_shape=(224,224,3))
20+
21+
# Reload the training table with the same derived columns and the same two
# row filters as the preprocessing script, so rows line up with Ximg.npy.
df = pd.read_csv('Train.csv',sep=';')
22+
df['lin_mass'] = np.power(10, df.logMstar)
23+
df['lin_err'] = df.lin_mass * np.log(10) * df.err_logMstar
24+
25+
df = df[df.logMstar!=-99]
26+
df = df[df.err_logMstar!=0]
27+
np.random.seed(0)
28+
29+
# N: all remaining rows. M: split index -- rows [:M] are used for training and
# the last 4000 rows [M:] as a hold-out further down.
N=len(df.SDSS_ID.values)
30+
M=N-4000
31+
ids = df.SDSS_ID.values[:N]
32+
print(len(ids))
33+
Y = df.logMstar.values[:N]
34+
err = df.err_logMstar.values[:N]
35+
Y_lin = df.lin_mass.values[:N]
36+
err_lin = df.lin_err.values[:N]
37+
38+
gids = ['Train/'+str(id)+'-g.csv' for id in ids]
39+
print(len(gids))
40+
41+
# Load the preprocessed images written by the preprocessing script.
print('loading Xg')
42+
Xg = np.load('Ximg.npy')
43+
44+
print(Xg.shape)
45+
print(np.min(Xg),np.max(Xg))
46+
47+
# Copy the single-channel images into all 3 channels via broadcasting; the
# reshape requires Xg to hold exactly N images of 224x224.
print('reshaping')
48+
Xg3 = np.zeros((N,224,224,3))
49+
Xg3[:,:,:,:] = Xg.reshape(N,224,224,1)
50+
print(np.min(Xg3),np.max(Xg3))
51+
52+
# Run each pretrained network over all images; the ResNet50 output is
# flattened to 2048 values per image.
print('r50')
53+
Xg3r50 = r50.predict(Xg3).reshape(N,2048)
54+
print('vgg16')
55+
Xg3vgg16 = vgg16.predict(Xg3)
56+
# NOTE(review): Xg3vgg19 is commented out of the feature stack below, so this
# forward pass appears to be unused work -- confirm before removing.
print('vgg19')
57+
Xg3vgg19 = vgg19.predict(Xg3)
58+
print('done')
59+
60+
# Assemble the training feature matrix: CNN outputs, hand-built transforms of
# Distance, and simple per-image pixel statistics, stacked column-wise.
print('Features X g band 3 ch features')
61+
62+
Distance = df.Distance.values[:N].reshape(N,1)
63+
64+
# Half-width of the central window averaged in the last feature.
csize = 2
65+
66+
Xg3f = np.hstack ( (
67+
Xg3r50,
68+
Xg3vgg16,
69+
# Xg3vgg19,
70+
# Powers, reciprocals and logs of Distance.
# NOTE(review): 1/np.log(Distance) is infinite when Distance == 1 and the log is
# undefined for Distance <= 0 -- confirm the data never hits these values.
Distance,
71+
1/Distance,
72+
Distance**2,
73+
1/(Distance**2),
74+
Distance**3,
75+
1/(Distance**3),
76+
np.log(Distance),
77+
1/np.log(Distance),
78+
np.log(Distance**2),
79+
1/np.log(Distance**2),
80+
np.log(Distance)**2,
81+
1/np.log(Distance)**2,
82+
# Per-image statistics over the flattened 3-channel array.
np.sum(Xg3.reshape(N,-1),axis=1).reshape(N,1),
83+
np.min(Xg3.reshape(N,-1),axis=1).reshape(N,1),
84+
np.max(Xg3.reshape(N,-1),axis=1).reshape(N,1),
85+
np.mean(Xg3.reshape(N,-1),axis=1).reshape(N,1),
86+
np.std(Xg3.reshape(N,-1),axis=1).reshape(N,1),
87+
# Pixel (112, 112) of channel 0, then the mean of the (2*csize)x(2*csize)
# window around it.
Xg3[:,112,112,0].reshape(N,1), # center
88+
np.mean(Xg3[:,112-csize:112+csize,112-csize:112+csize,0].reshape(N,-1),axis=1).reshape(N,-1), # mean center
89+
) )
90+
91+
92+
print(Xg3r50.shape)
93+
print(Xg3vgg16.shape)
94+
print(Xg3f.shape)
95+
96+
# Persist the assembled feature matrix under the name 'Xg3f'.
np.save('Xg3f',Xg3f)
97+
98+
# Split by position: the first M rows train the model, the remaining rows
# (the last 4000) form the hold-out set. The target is logMstar.
dtrain = lgbm.Dataset(Xg3f[:M], label= Y[:M])
99+
dtest = lgbm.Dataset(Xg3f[M:], label= Y[M:])
100+
101+
102+
# LightGBM settings: L2 regression with gradient-boosted trees, evaluated by RMSE.
lgbm_params = {
103+
'boosting_type': 'gbdt',
104+
'objective': 'regression_l2',
105+
'nthread': 35,
106+
'silent': True,
107+
'num_leaves': 2**4,
108+
'learning_rate': 0.05,
109+
'max_depth': 10,
110+
'max_bin': 255,
111+
#'subsample_for_bin': 50000,
112+
#'subsample': 0.8,
113+
#'subsample_freq': 1,
114+
#'colsample_bytree': 0.8,
115+
#'reg_alpha': 1,
116+
#'reg_lambda': 0,
117+
#'min_split_gain': 0.5,
118+
#'min_child_weight': 1,
119+
#'min_child_samples': 60,
120+
#'scale_pos_weight': 1,
121+
#'device' : 'gpu',
122+
'metric' : 'rmse',
123+
#'eval_metric' : 'rmse',
124+
#'metric' : 'multi_error',
125+
'verbose':0,
126+
}
127+
128+
# 3-fold shuffled cross-validation on the training rows, up to 10000 rounds
# with early stopping after 200 rounds; used to choose the number of rounds.
bst = lgbm.cv(lgbm_params, dtrain, num_boost_round=10000, data_splitter=None, nfold=3, stratified=False, shuffle=True,
129+
metrics=None, fobj=None, feval=None, init_model=None, feature_name='auto',
130+
categorical_feature='auto', early_stopping_rounds=200, fpreproc=None,
131+
verbose_eval=10, show_stdv=True, seed=0, callbacks=None)
132+
133+
# Number of rounds for the final fit: length of the recorded CV RMSE history minus one.
num_boost_round = len(bst['rmse-mean'])-1
134+
print(num_boost_round)
135+
136+
137+
# Final fit on the training rows only; the hold-out set is passed for monitoring.
# NOTE(review): early_stopping_rounds equals num_boost_round, so early stopping
# looks effectively inert here -- confirm this is intended.
model = lgbm.train(lgbm_params, dtrain, num_boost_round,
138+
valid_sets=[dtest], valid_names=['test'], fobj=None, feval=None,
139+
init_model=None, feature_name='auto', categorical_feature='auto',
140+
early_stopping_rounds=num_boost_round, evals_result=None, verbose_eval=10,
141+
learning_rates=None, callbacks=None)
142+
143+
# Hold-out predictions, on the same scale as the logMstar target.
pred = model.predict(Xg3f[M:])
144+
145+
def xi2(true, pred, error):
    """Return the mean of the squared residuals divided by the squared errors.

    Args:
        true: Reference values.
        pred: Predicted values.
        error: Per-value errors used to scale each residual.

    Returns:
        ``np.mean((true - pred)**2 / error**2)``.
    """
    residual = true - pred
    return np.mean(residual ** 2 / error ** 2)
148+
149+
# Score the hold-out predictions on the log scale ...
print('xi2',xi2(Y[M:],pred,err[M:]))
150+
# ... and on the linear scale, converting targets and predictions with 10**x
# and using the linear-scale errors.
xi2lin = xi2(10**Y[M:],10**pred,err_lin[M:])
151+
print('xi2lin',xi2lin)
152+
153+
# The linear-scale score is embedded in the saved model's file name.
model.save_model('lgbm'+str(xi2lin), num_iteration=-1)
154+
155+
### TEST
156+
157+
# Build CNN features for every row of the test table (no row filtering here).
df_test = pd.read_csv('Test_Distance.csv',sep=';')
158+
#df_valid = pd.read_csv('validationdata_SDSSID.csv',sep=';')
159+
#ids = df_test[df_test.SDSS_ID.isin(df_valid.SDSS_ID)]['SDSS_ID']
160+
ids = df_test.SDSS_ID
161+
162+
gids = ['Test/'+str(id)+'-g.csv' for id in ids]
163+
164+
# NOTE(review): Xi_ is never appended to or read in the visible code.
Xg_,Xi_ = [],[]
165+
# Serial version of the steps in img_preproc from the preprocessing script:
# load, clean, standardise, centre-crop, resize to 224x224.
for i in range(len(ids)):
166+
Xg = np.genfromtxt (gids[i], delimiter=",")
167+
Xg = cleanupImage2(Xg)
168+
Xg -= np.mean(Xg)
169+
Xg /= np.std(Xg)
170+
h,w = Xg.shape
171+
cy, cx = h//2, w//2
172+
dy, dx = cy//2, cx//2
173+
Xg = Xg[cy-dy:cy+dy,cx-dx:cx+dx]
174+
# NOTE(review): the third positional parameter of cv2.resize is `dst`, not
# `interpolation` -- confirm whether INTER_AREA is actually applied. Any change
# must be mirrored in the training-side preprocessing.
Xgr = cv2.resize(Xg,(224,224),cv2.INTER_AREA)
175+
Xg_.append(Xgr)
176+
# Progress output every 10th image.
if i%10==0:
177+
print(i,end=' ',flush=True)
178+
179+
# From here on N is the number of test rows (it rebinds the training N).
N = len(ids)
180+
Xg = np.stack(Xg_)
181+
# Copy the single channel into all 3 channels, as done for the training images.
Xg3 = np.zeros((N,224,224,3))
182+
Xg3[:,:,:,:] = Xg.reshape(N,224,224,1)
183+
print('r50')
184+
Xg3r50 = r50.predict(Xg3).reshape(N,2048)
185+
print('vgg16')
186+
Xg3vgg16 = vgg16.predict(Xg3)
187+
#Xg3vgg19 = vgg19.predict(Xg3)
188+
#Distance = df_test[df_test.SDSS_ID.isin(ids)].Distance.values.reshape(N,1)
189+
Distance = df_test.Distance.values.reshape(N,1)
190+
191+
# Assemble the test feature matrix with the same columns, in the same order,
# as the training feature stack, then predict and write the submission file.
csize = 2
192+
193+
Xg3f = np.hstack ( (
194+
Xg3r50,
195+
Xg3vgg16,
196+
# Xg3vgg19,
197+
# Powers, reciprocals and logs of Distance.
# NOTE(review): 1/np.log(Distance) is infinite when Distance == 1 -- confirm
# the test data never hits this value.
Distance,
198+
1/Distance,
199+
Distance**2,
200+
1/(Distance**2),
201+
Distance**3,
202+
1/(Distance**3),
203+
np.log(Distance),
204+
1/np.log(Distance),
205+
np.log(Distance**2),
206+
1/np.log(Distance**2),
207+
np.log(Distance)**2,
208+
1/np.log(Distance)**2,
209+
# Per-image statistics over the flattened 3-channel array.
np.sum(Xg3.reshape(N,-1),axis=1).reshape(N,1),
210+
np.min(Xg3.reshape(N,-1),axis=1).reshape(N,1),
211+
np.max(Xg3.reshape(N,-1),axis=1).reshape(N,1),
212+
np.mean(Xg3.reshape(N,-1),axis=1).reshape(N,1),
213+
np.std(Xg3.reshape(N,-1),axis=1).reshape(N,1),
214+
Xg3[:,112,112,0].reshape(N,1), # center
215+
np.mean(Xg3[:,112-csize:112+csize,112-csize:112+csize,0].reshape(N,-1),axis=1).reshape(N,-1), # mean center
216+
217+
) )
218+
219+
220+
# Drop the large intermediates now that the feature matrix is built.
del Xg3r50
221+
del Xg3vgg16
222+
del Xg3
223+
224+
# NOTE(review): dtest_final is not used afterwards; predict() below takes the
# raw feature matrix directly.
dtest_final = lgbm.Dataset(Xg3f)
225+
# NOTE(review): pred is on the logMstar scale the model was trained on --
# confirm that the submission's 'mass' column is expected in log units.
pred = model.predict(Xg3f)
226+
df_sub = pd.DataFrame({'pssid':ids, 'mass':pred}, columns=['pssid', 'mass'])
227+
# The hold-out linear-scale score is embedded in the submission file name.
df_sub.to_csv('submission_gold_'+str(xi2lin)+'.csv', index=False)
228+

0 commit comments

Comments
 (0)