2020from scipy import interp
2121from scipy .stats import randint as sp_randint
2222from scipy .stats import uniform as sp_random
23+ from sklearn .base import clone
2324from sklearn import preprocessing , feature_selection , decomposition
2425from sklearn import cluster , metrics
2526from sklearn import ensemble , neighbors , svm , tree
@@ -897,9 +898,10 @@ def eval_classif_cross_val_roc(clf_name, classif, features, labels,
897898 cross_val = cross_val .split (features , labels )
898899 count = 0.
899900 for train , test in cross_val :
900- classif .fit (np .copy (features [train ], order = 'C' ),
901- np .copy (labels [train ], order = 'C' ))
902- proba = classif .predict_proba (np .copy (features [test ], order = 'C' ))
901+ classif_cv = clone (classif )
902+ classif_cv .fit (np .copy (features [train ], order = 'C' ),
903+ np .copy (labels [train ], order = 'C' ))
904+ proba = classif_cv .predict_proba (np .copy (features [test ], order = 'C' ))
903905 # Compute ROC curve and area under the curve
904906 for i , lb in enumerate (unique_labels ):
905907 fpr , tpr , _ = metrics .roc_curve (labels_bin [test , lb ], proba [:, i ])
@@ -934,6 +936,12 @@ def search_params_cut_down_max_nb_iter(clf_parameters, nb_iter):
934936 :param clf_parameters: {str: ...}
935937 :param nb_iter: int, nb of random tries
936938 :return: int
939+
940+ >>> clf_params = create_clf_param_search_grid(DEFAULT_CLASSIF_NAME)
941+ >>> search_params_cut_down_max_nb_iter(clf_params, 100)
942+ 100
943+ >>> search_params_cut_down_max_nb_iter(clf_params, 1e6)
944+ 1450
937945 """
938946 counts = []
939947 for k in clf_parameters :
@@ -944,7 +952,7 @@ def search_params_cut_down_max_nb_iter(clf_parameters, nb_iter):
944952 return nb_iter
945953 count = np .product (counts )
946954 if count < nb_iter :
947- nb_iter < count
955+ nb_iter = count
948956 return nb_iter
949957
950958
@@ -1384,32 +1392,40 @@ class HoldOut:
13841392
13851393 Example
13861394 -------
1387- >>> ho = HoldOut(10, 7)
1395+ >>> ho = HoldOut(10, 7, rand_seed=None )
13881396 >>> len(ho)
13891397 1
13901398 >>> list(ho)
13911399 [([0, 1, 2, 3, 4, 5, 6], [7, 8, 9])]
1400+ >>> ho = HoldOut(10, 7, rand_seed=0)
1401+ >>> list(ho)
1402+ [([2, 8, 4, 9, 1, 6, 7], [3, 0, 5])]
13921403 """
1393- def __init__ (self , nb , hold_idx , random_state = 0 ):
1404+ def __init__ (self , nb_samples , hold_out , rand_seed = 0 ):
13941405 """
13951406
1396- :param int nb : total number of samples
1397- :param int hold_idx : index where the test starts
1398- :param obj random_state : Seed for the random number generator.
1407+ :param int nb_samples : total number of samples
1408+ :param int hold_out : index where the test starts
1409+ :param obj rand_seed : Seed for the random number generator.
13991410 """
1400- self .total = nb
1401- self .hold_idx = hold_idx
1402- self .random_state = random_state
1403- assert self .total > self .hold_idx , \
1404- 'total %i should be higher than hold Idx %i' % (self .total , self .hold_idx )
1411+ assert nb_samples > hold_out , \
1412+ 'total %i should be higher than hold Idx %i' % (nb_samples , hold_out )
1413+
1414+ self ._total = nb_samples
1415+ self .hold_out = hold_out
1416+ self ._indexes = list (range (nb_samples ))
1417+
1418+ if rand_seed is not None and rand_seed is not False :
1419+ np .random .seed (rand_seed )
1420+ np .random .shuffle (self ._indexes )
14051421
14061422 def __iter__ (self ):
14071423 """ iterate the folds
14081424
14091425 :return ([int], [int]):
14101426 """
1411- ind_train = list ( range ( self .hold_idx ))
1412- ind_test = list ( range ( self .hold_idx , self .total ))
1427+ ind_train = self ._indexes [: self . hold_out ]
1428+ ind_test = self ._indexes [ self .hold_out :]
14131429 yield ind_train , ind_test
14141430
14151431 def __len__ (self ):
@@ -1438,22 +1454,29 @@ class CrossValidatePOut:
14381454 >>> len(cv)
14391455 2
14401456 >>> list(cv) # doctest: +NORMALIZE_WHITESPACE
1441- [([3, 4, 5], [0, 1, 2]), \
1457+ [([3, 4, 5], [0, 1, 2]),
14421458 ([0, 1, 2], [3, 4, 5])]
14431459
14441460 Example 2
14451461 ---------
14461462 >>> cv = CrossValidatePOut(7, 3, rand_seed=0)
14471463 >>> list(cv) # doctest: +NORMALIZE_WHITESPACE
1448- [([3, 0, 5, 4], [6, 2, 1]), \
1449- ([6, 2, 1, 4], [3, 0, 5]), \
1464+ [([3, 0, 5, 4], [6, 2, 1]),
1465+ ([6, 2, 1, 4], [3, 0, 5]),
14501466 ([6, 2, 1, 3, 0, 5], [4])]
1451-
1452-
14531467 >>> len(list(cv))
14541468 3
14551469 >>> cv.indexes
14561470 [6, 2, 1, 3, 0, 5, 4]
1471+
1472+ Example 3
1473+ ---------
1474+ >>> cv = CrossValidatePOut(7, 5, rand_seed=0)
1475+ >>> list(cv) # doctest: +NORMALIZE_WHITESPACE
1476+ [([6, 2], [1, 3, 0, 5, 4]),
1477+ ([1, 3], [6, 2, 0, 5, 4]),
1478+ ([0, 5], [6, 2, 1, 3, 4]),
1479+ ([4], [6, 2, 1, 3, 0, 5])]
14571480 """
14581481
14591482 def __init__ (self , nb_samples , nb_hold_out , rand_seed = None ):
@@ -1464,13 +1487,21 @@ def __init__(self, nb_samples, nb_hold_out, rand_seed=None):
14641487 :param obj rand_seed: int or None
14651488 """
14661489 assert nb_samples > nb_hold_out , \
1467- 'number of holdout has to be smaller then total size'
1468- self .nb_samples = nb_samples
1469- self .nb_hold_out = nb_hold_out
1490+ 'number of holdout has to be smaller then _total size'
1491+ self ._nb_samples = nb_samples
1492+ self ._nb_hold_out = nb_hold_out
1493+
1494+ self ._revert = False # sets the sizes
1495+ if self ._nb_hold_out > (self ._nb_samples / 2. ):
1496+ logging .debug ('WARNING: you are running in reverse mode, '
1497+ 'while using all training examples '
1498+ 'there are much more yield test cases.' )
1499+ self ._nb_hold_out = self ._nb_samples - self ._nb_hold_out
1500+ self ._revert = True
14701501
1471- self .indexes = list (range (self .nb_samples ))
1502+ self .indexes = list (range (self ._nb_samples ))
14721503
1473- if rand_seed is not False :
1504+ if rand_seed is not None and rand_seed is not False :
14741505 np .random .seed (rand_seed )
14751506 np .random .shuffle (self .indexes )
14761507 logging .debug ('sets ordering: %s' , repr (self .indexes ))
@@ -1482,17 +1513,19 @@ def __iter__(self):
14821513
14831514 :return ([int], [int]):
14841515 """
1485- for i in range (0 , self .nb_samples , self .nb_hold_out ):
1486- inds_test = self .indexes [i :i + self .nb_hold_out ]
1516+ for i in range (0 , self ._nb_samples , self ._nb_hold_out ):
1517+ inds_test = self .indexes [i :i + self ._nb_hold_out ]
14871518 inds_train = [i for i in self .indexes if i not in inds_test ]
1519+ if self ._revert :
1520+ inds_train , inds_test = inds_test , inds_train
14881521 yield inds_train , inds_test
14891522
14901523 def __len__ (self ):
14911524 """ number of folds
14921525
14931526 :return int:
14941527 """
1495- return int (np .ceil (self .nb_samples / float (self .nb_hold_out )))
1528+ return int (np .ceil (self ._nb_samples / float (self ._nb_hold_out )))
14961529
14971530
14981531class CrossValidatePSetsOut :
@@ -1513,7 +1546,7 @@ class CrossValidatePSetsOut:
15131546 >>> len(cv)
15141547 2
15151548 >>> list(cv) # doctest: +NORMALIZE_WHITESPACE
1516- [([5, 6, 7, 8, 9], [0, 1, 2, 3, 4]), \
1549+ [([5, 6, 7, 8, 9], [0, 1, 2, 3, 4]),
15171550 ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9])]
15181551
15191552 Example 2
@@ -1522,13 +1555,21 @@ class CrossValidatePSetsOut:
15221555 >>> cv.set_indexes
15231556 [[0, 1], [2, 3], [4], [5, 6], [7]]
15241557 >>> list(cv) # doctest: +NORMALIZE_WHITESPACE
1525- [([2, 3, 5, 6, 7], [4, 0, 1]), \
1526- ([4, 0, 1, 7], [2, 3, 5, 6]), \
1558+ [([2, 3, 5, 6, 7], [4, 0, 1]),
1559+ ([4, 0, 1, 7], [2, 3, 5, 6]),
15271560 ([4, 0, 1, 2, 3, 5, 6], [7])]
15281561 >>> len(cv)
15291562 3
15301563 >>> cv.sets_order
15311564 [2, 0, 1, 3, 4]
1565+
1566+ Example 3
1567+ ---------
1568+ >>> cv = CrossValidatePSetsOut([2, 2, 1, 2, 1, 1], 4, rand_seed=0)
1569+ >>> list(cv) # doctest: +NORMALIZE_WHITESPACE
1570+ [([8, 4], [2, 3, 5, 6, 0, 1, 7]),
1571+ ([2, 3, 5, 6], [8, 4, 0, 1, 7]),
1572+ ([0, 1, 7], [8, 4, 2, 3, 5, 6])]
15321573 """
15331574
15341575 def __init__ (self , set_sizes , nb_hold_out , rand_seed = None ):
@@ -1539,24 +1580,32 @@ def __init__(self, set_sizes, nb_hold_out, rand_seed=None):
15391580 :param obj rand_seed: int or None
15401581 """
15411582 assert len (set_sizes ) > nb_hold_out , \
1542- 'nb of hold out (%i) has to be smaller then total size %i' \
1583+ 'nb of hold out (%i) has to be smaller then _total size %i' \
15431584 % (nb_hold_out , len (set_sizes ))
1544- self .set_sizes = list (set_sizes )
1545- self .total = np .sum (self .set_sizes )
1546- self .nb_hold_out = nb_hold_out
1585+ self ._set_sizes = list (set_sizes )
1586+ self ._total = np .sum (self ._set_sizes )
1587+ self ._nb_hold_out = nb_hold_out
1588+
1589+ self ._revert = False # sets the sizes
1590+ if self ._nb_hold_out > (len (self ._set_sizes ) / 2. ):
1591+ logging .debug ('WARNING: you are running in reverse mode, '
1592+ 'while using all training examples '
1593+ 'there are much more yield test cases.' )
1594+ self ._nb_hold_out = len (self ._set_sizes ) - self ._nb_hold_out
1595+ self ._revert = True
15471596
15481597 self .set_indexes = []
1549- for i , size in enumerate (self .set_sizes ):
1550- start = int (np .sum (self .set_sizes [:i ]))
1598+ for i , size in enumerate (self ._set_sizes ):
1599+ start = int (np .sum (self ._set_sizes [:i ]))
15511600 inds = range (start , start + size )
15521601 self .set_indexes .append (list (inds ))
15531602
1554- assert np .sum (len (i ) for i in self .set_indexes ) == self .total , \
1555- 'all indexes should sum to total count %i' % self .total
1603+ assert np .sum (len (i ) for i in self .set_indexes ) == self ._total , \
1604+ 'all indexes should sum to _total count %i' % self ._total
15561605
1557- self .sets_order = list (range (len (self .set_sizes )))
1606+ self .sets_order = list (range (len (self ._set_sizes )))
15581607
1559- if rand_seed is not False :
1608+ if rand_seed is not None and rand_seed is not False :
15601609 np .random .seed (rand_seed )
15611610 np .random .shuffle (self .sets_order )
15621611 logging .debug ('sets ordering: %s' , repr (self .sets_order ))
@@ -1568,20 +1617,22 @@ def __iter__(self):
15681617
15691618 :return ([int], [int]):
15701619 """
1571- for i in range (0 , len (self .set_sizes ), self .nb_hold_out ):
1572- test = self .sets_order [i :i + self .nb_hold_out ]
1620+ for i in range (0 , len (self ._set_sizes ), self ._nb_hold_out ):
1621+ test = self .sets_order [i :i + self ._nb_hold_out ]
15731622 inds_train = list (itertools .chain .from_iterable (
15741623 self .set_indexes [i ] for i in self .sets_order if i not in test ))
15751624 inds_test = list (itertools .chain .from_iterable (
15761625 self .set_indexes [i ] for i in self .sets_order if i in test ))
1626+ if self ._revert :
1627+ inds_train , inds_test = inds_test , inds_train
15771628 yield inds_train , inds_test
15781629
15791630 def __len__ (self ):
15801631 """ number of folds
15811632
15821633 :return int:
15831634 """
1584- nb = len (self .set_sizes ) / float (self .nb_hold_out )
1635+ nb = len (self ._set_sizes ) / float (self ._nb_hold_out )
15851636 return int (np .ceil (nb ))
15861637
15871638
0 commit comments