Imbalanced Data Transform

We offer a simple solution for imbalanced classes in a classification problem. Based on the imbalanced-learn package, you can choose between oversampling, undersampling, and combined sampling (a combination of both). Have a look at the Developer Website for details about the data-balancing algorithms.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import warnings
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.datasets import fetch_datasets

from photonai.base import Hyperpipe, PipelineElement
from photonai.optimization import Categorical

# The data is very imbalanced, so some metrics run into zero-divisions;
# silence the warnings scikit-learn raises in that case.
warnings.simplefilter("ignore", category=UndefinedMetricWarning)

# Example of an imbalanced dataset.
# Class ratio is roughly 6% : 94% (minority : majority).
# NOTE(review): imbalanced-learn documents these targets as -1 (majority)
# and 1 (minority) rather than 0 / 1 -- confirm against the installed version.
dataset = fetch_datasets()['coil_2000']
X = dataset.data
y = dataset.target

# Cross-validation splitters: 3 stratified folds for the outer loop and
# 5 stratified shuffle splits (20% held out each) for the inner loop.
outer_folds = StratifiedKFold(n_splits=3)
inner_folds = StratifiedShuffleSplit(n_splits=5, test_size=0.2)

# Grid search over the configurations; 'f1_score' is the metric used to
# select the best configuration.
my_pipe = Hyperpipe(
    'balancing_pipe',
    optimizer='grid_search',
    metrics=['accuracy', 'precision', 'recall', 'f1_score'],
    best_config_metric='f1_score',
    outer_cv=outer_folds,
    inner_cv=inner_folds,
    verbosity=1,
    project_folder='./tmp/',
)


# ADD ELEMENTS TO YOUR PIPELINE
scaler = PipelineElement('StandardScaler')

# Balancing strategies to compare; the choice of strategy is the
# hyperparameter that gets searched.
tested_methods = Categorical(['RandomOverSampler', 'SMOTEENN', 'SVMSMOTE',
                              'BorderlineSMOTE', 'SMOTE'])

# Only SMOTE receives a non-default argument (k_neighbors=3); every other
# strategy keeps its default settings.
# Do not try to optimize over these per-method arguments: pass them through
# `config` and never place them inside `hyperparameters`.
# NOTE(review): test_disabled=True presumably also evaluates the pipeline
# with this element switched off -- confirm against the PHOTONAI docs.
balancer = PipelineElement(
    'ImbalancedDataTransformer',
    hyperparameters={'method_name': tested_methods},
    config={"SMOTE": {"k_neighbors": 3}},
    test_disabled=True,
)

classifier = PipelineElement("RandomForestClassifier", n_estimators=200)

# Append the elements in processing order: scale -> balance -> classify.
for element in (scaler, balancer, classifier):
    my_pipe += element

# NOW TRAIN YOUR PIPELINE
my_pipe.fit(X, y)

# Possible values for method_name:
# imbalance_type = OVERSAMPLING:
#        - ADASYN
#        - BorderlineSMOTE
#        - KMeansSMOTE
#        - RandomOverSampler
#        - SMOTE
#        - SMOTENC
#        - SVMSMOTE
#
# imbalance_type = UNDERSAMPLING:
#        - ClusterCentroids,
#        - RandomUnderSampler,
#        - NearMiss,
#        - InstanceHardnessThreshold,
#        - CondensedNearestNeighbour,
#        - EditedNearestNeighbours,
#        - RepeatedEditedNearestNeighbours,
#        - AllKNN,
#        - NeighbourhoodCleaningRule,
#        - OneSidedSelection
#
# imbalance_type = COMBINE:
#        - SMOTEENN,
#        - SMOTETomek