modeling.py
import numpy
import sklearn.grid_search         # pre-0.18 scikit-learn module path; 0.18+ moved this to sklearn.model_selection
import sklearn.cross_validation    # pre-0.18 scikit-learn module path; 0.18+ moved this to sklearn.model_selection
import mlutilities.types


def tuneModel(dataSet, tuneModelConfiguration, randomSeed=None, n_jobs=1):
    """
    Finds the best combination of a set of model parameters for a DataSet.
    :param dataSet: presumes that the last column in nonFeaturesDataFrame is the label
    :param tuneModelConfiguration: TuneModelConfiguration specifying the modelling method, parameter grid, and score method
    :param randomSeed: seed for shuffling the cross-validation folds
    :param n_jobs: number of jobs GridSearchCV may run in parallel
    :return: TuneModelResult
    """
    # Get features and label from dataSet
    features = dataSet.featuresDataFrame
    label = dataSet.labelSeries

    # Grid search to find the best parameters
    gridSearchPredictor = sklearn.grid_search.GridSearchCV(tuneModelConfiguration.modellingMethod.function(),
                                                           tuneModelConfiguration.parameterGrid,
                                                           scoring=tuneModelConfiguration.scoreMethod,
                                                           cv=sklearn.cross_validation.KFold(len(label),
                                                                                             n_folds=5,
                                                                                             shuffle=True,
                                                                                             random_state=randomSeed),
                                                           refit=False,
                                                           n_jobs=n_jobs)
    gridSearchPredictor.fit(features, label)

    # GridSearchCV negates scores for loss functions (like MSE) so that the highest score is always best;
    # this must be undone for reporting
    if tuneModelConfiguration.scoreMethod == 'mean_squared_error':
        bestScore = -gridSearchPredictor.best_score_
    else:
        bestScore = gridSearchPredictor.best_score_

    # Create new TuneModelResult object
    tuneModelResult = mlutilities.types.TuneModelResult('Tuned ' + tuneModelConfiguration.description
                                                        + ' for DataSet: ' + dataSet.description,
                                                        dataSet,
                                                        tuneModelConfiguration.modellingMethod,
                                                        gridSearchPredictor.best_params_,
                                                        tuneModelConfiguration.scoreMethod,
                                                        bestScore,
                                                        gridSearchPredictor.grid_scores_)
    return tuneModelResult

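
# Illustrative usage sketch for tuneModel. The construction below is hypothetical:
# the real mlutilities.types constructors may take different arguments.
#
#   ridgeMethod = mlutilities.types.ModellingMethod('Ridge Regression', sklearn.linear_model.Ridge)
#   tuneConfig = mlutilities.types.TuneModelConfiguration('Ridge grid search',
#                                                         ridgeMethod,
#                                                         {'alpha': [0.1, 1.0, 10.0]},
#                                                         'mean_squared_error')
#   tuneResult = tuneModel(myDataSet, tuneConfig, randomSeed=42, n_jobs=4)

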
def tuneModels(dataSets, tuneModelConfigurations, randomSeed=None):
    """
    Wrapper function to loop through multiple data sets and tune model configurations.
    :param dataSets: list of DataSets
    :param tuneModelConfigurations: list of TuneModelConfigurations
    :param randomSeed: seed passed through to tuneModel
    :return: list of TuneModelResults
    """
    tuneModelResults = []
    counter = 1
    total = len(dataSets) * len(tuneModelConfigurations)
    for dataSet in dataSets:
        for tuneModelConfiguration in tuneModelConfigurations:
            print('Tuning (%s of %s):' % (counter, total), tuneModelConfiguration.description, 'for', dataSet.description)
            tuneModelResult = tuneModel(dataSet, tuneModelConfiguration, randomSeed)
            tuneModelResults.append(tuneModelResult)
            counter += 1
    return tuneModelResults


def applyModel(applyModelConfiguration):
    """
    Given a model, its parameters, a training set, and a test set, train the model and apply it to the test data.
    :param applyModelConfiguration: ApplyModelConfiguration
    :return: ApplyModelResult
    """
    # Get features and label from DataSets
    trainFeatures = applyModelConfiguration.trainDataSet.featuresDataFrame
    trainLabel = applyModelConfiguration.trainDataSet.labelSeries
    testFeatures = applyModelConfiguration.testDataSet.featuresDataFrame

    # Train model on training set
    predictor = applyModelConfiguration.modellingMethod.function(**applyModelConfiguration.parameters)
    predictor.fit(trainFeatures, trainLabel)

    # Predict for testing set
    testPredictions = predictor.predict(testFeatures)

    # Build ApplyModelResult
    applyModelResult = mlutilities.types.ApplyModelResult(applyModelConfiguration.description.replace('Apply', 'Result:'),
                                                          testPredictions,
                                                          applyModelConfiguration.testDataSet,
                                                          applyModelConfiguration.modellingMethod,
                                                          applyModelConfiguration.parameters)
    return applyModelResult

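
# Illustrative usage sketch for applyModel. The ApplyModelConfiguration construction
# is hypothetical (argument order and names may differ in mlutilities.types); the
# tuned parameters would typically come from a TuneModelResult as in the sketch above.
#
#   applyConfig = mlutilities.types.ApplyModelConfiguration('Apply Ridge Regression',
#                                                           ridgeMethod,
#                                                           tuneResult.parameters,
#                                                           trainDataSet,
#                                                           testDataSet)
#   applyResult = applyModel(applyConfig)

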
def applyModels(applyModelConfigurations, subTaskPrint=True):
    """
    Wrapper function to loop through multiple ApplyModelConfigurations.
    :param applyModelConfigurations: list of ApplyModelConfigurations
    :param subTaskPrint: when True, print progress for each configuration
    :return: list of ApplyModelResults
    """
    applyModelResults = []
    counter = 1
    total = len(applyModelConfigurations)
    for applyModelConfiguration in applyModelConfigurations:
        if subTaskPrint:
            print('Applying (%s of %s):' % (counter, total), applyModelConfiguration.description)
        applyModelResult = applyModel(applyModelConfiguration)
        applyModelResults.append(applyModelResult)
        counter += 1
    return applyModelResults


def scoreModel(applyModelResult, modelScoreMethods):
    """
    Scores the result of applying a model using one or more sklearn.metrics-style scoring methods.
    :param applyModelResult: ApplyModelResult
    :param modelScoreMethods: list of ModelScoreMethods
    :return: ScoreModelResult
    """
    testLabel = applyModelResult.testDataSet.labelSeries
    testPredictions = applyModelResult.testPredictions

    modelScores = []
    for modelScoreMethod in modelScoreMethods:
        score = modelScoreMethod.function(testLabel, testPredictions)
        modelScore = mlutilities.types.ModelScore(score, modelScoreMethod)
        modelScores.append(modelScore)

    scoreModelResult = mlutilities.types.ScoreModelResult(applyModelResult.description + ', Test Score',
                                                          applyModelResult.modellingMethod,
                                                          applyModelResult.parameters,
                                                          modelScores)
    return scoreModelResult

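
# Illustrative usage sketch for scoreModel: any callable with the (y_true, y_pred)
# signature can be wrapped, including the O/E functions defined below. The
# ModelScoreMethod construction is hypothetical and may differ from the real API.
#
#   import sklearn.metrics
#   mse = mlutilities.types.ModelScoreMethod('Mean Squared Error', sklearn.metrics.mean_squared_error)
#   meanOE = mlutilities.types.ModelScoreMethod('Mean O/E', meanObservedExpectedScore)
#   scoreResult = scoreModel(applyResult, [mse, meanOE])

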
def scoreModels(applyModelResults, modelScoreMethods):
    """
    Wrapper function to loop through multiple ApplyModelResult objects.
    :param applyModelResults: list of ApplyModelResults
    :param modelScoreMethods: list of ModelScoreMethods
    :return: list of ScoreModelResults
    """
    scoreModelResults = []
    for applyModelResult in applyModelResults:
        scoreModelResult = scoreModel(applyModelResult, modelScoreMethods)
        scoreModelResults.append(scoreModelResult)
    return scoreModelResults

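
# Taken together, the functions above compose into a tune -> apply -> score pipeline,
# roughly (variable names illustrative only):
#
#   tuneResults = tuneModels(dataSets, tuneConfigs, randomSeed=42)
#   applyResults = applyModels(applyConfigs)    # applyConfigs built from tuneResults
#   scoreResults = scoreModels(applyResults, scoreMethods)

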
def meanObservedExpectedScore(y_true, y_pred):
    """
    Function modeled after the sklearn.metrics scoring functions that calculates the mean observed/expected (O/E)
    ratio for a model.
    Works when y_true and y_pred are vector-like pandas DataFrames or numpy arrays.
    Ignores any NaN or inf values (e.g. ratios produced by zero predictions).
    """
    oeRatios = y_true / y_pred
    meanOE = numpy.nanmean(oeRatios[~ numpy.isinf(oeRatios)])
    return float(meanOE)

def sdObservedExpectedScore(y_true, y_pred):
    """
    Function modeled after the sklearn.metrics scoring functions that calculates the standard deviation of the
    observed/expected (O/E) ratio for a model.
    Works when y_true and y_pred are vector-like pandas DataFrames or numpy arrays.
    Ignores any NaN or inf values (e.g. ratios produced by zero predictions).
    """
    oeRatios = y_true / y_pred
    standardDeviationOE = numpy.nanstd(oeRatios[~ numpy.isinf(oeRatios)])
    return float(standardDeviationOE)
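

# Minimal runnable sketch of the O/E scoring functions above. The arrays are invented
# illustration data; the zero in the predictions shows the inf filtering (numpy emits
# a divide-by-zero RuntimeWarning but the inf ratio is excluded from the statistics).
if __name__ == '__main__':
    observed = numpy.array([10.0, 8.0, 6.0, 5.0])
    predicted = numpy.array([9.0, 8.0, 0.0, 4.0])  # zero prediction yields an inf ratio, which is ignored
    print('Mean O/E:', meanObservedExpectedScore(observed, predicted))
    print('SD O/E:', sdObservedExpectedScore(observed, predicted))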