-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtypes.py
More file actions
361 lines (290 loc) · 13.7 KB
/
types.py
File metadata and controls
361 lines (290 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
import pandas
import numpy
import copy
class DataSet:
    """
    Everything we need to know about a data set to use it in our pipeline.

    In read mode ('r') it loads a csv from `path` into a pandas DataFrame; in any
    other mode it writes the supplied DataFrame out to `path` as a csv.

    featuresIndex is the positional index of the column at which the features start,
    counting from 0. The default assumption is that the first column is the label
    and the features start at the second column.

    labelIndex is the positional index of the label column; pass None when the
    data set has no label (e.g. an unlabeled test set).
    """
    def __init__(self, description, path, mode='r', dataFrame=None, featuresIndex=1, labelIndex=0):
        self.description = description
        self.path = path
        self.mode = mode
        self.featuresIndex = featuresIndex
        self.labelIndex = labelIndex
        if mode == 'r':
            # Read csv into a pandas frame; any dataFrame argument is ignored here.
            self.dataFrame = pandas.read_csv(self.path)
        else:
            # Deep-copy so later mutations of our frame don't affect the caller's object,
            # then write the frame out as csv.
            self.dataFrame = copy.deepcopy(dataFrame)
            self.dataFrame.to_csv(path, index=False)
        # `.ix` was removed in pandas 1.0; `.iloc` is the purely positional equivalent.
        self.nonFeaturesDataFrame = self.dataFrame.iloc[:, :self.featuresIndex]
        self.featuresDataFrame = self.dataFrame.iloc[:, self.featuresIndex:]
        # As long as there is a label, create the labelSeries
        if labelIndex is not None:
            self.labelSeries = self.dataFrame.iloc[:, labelIndex]

    def __str__(self):
        return self.__class__.__name__ + ' ' + self.description + ', Path: \'' + self.path + '\''

    def __eq__(self, other):
        """
        Compares two data sets on the basis of their paths.
        """
        return (isinstance(other, self.__class__)
                and self.path == other.path)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ disables the default hash; hash by path so equal
        # data sets hash equally and instances stay usable in sets/dicts.
        return hash(self.path)
class SplitDataSet:
    """
    Groups the two halves of a DataSet that has been split into training and testing parts.
    """
    def __init__(self, trainDataSet, testDataSet):
        self.trainDataSet = trainDataSet
        self.testDataSet = testDataSet

    def __str__(self):
        parts = [
            self.__class__.__name__,
            'Training: ' + str(self.trainDataSet),
            'Testing: ' + str(self.testDataSet),
        ]
        return '\n'.join(parts)
class Scaler:
    """
    Pairs a MinMaxScaler object with the original, unscaled DataSet that was used to fit it.
    """
    def __init__(self, dataSetUsedToFit, scalingObject):
        self.dataSetUsedToFit = dataSetUsedToFit
        self.scalingObject = scalingObject

    def __str__(self):
        return '{} for {}'.format(self.__class__.__name__, self.dataSetUsedToFit)
class ExtractSpecificFeatures:
    """
    Transformer that selects a fixed list of named columns from a dataframe,
    mimicking the sklearn fit_transform/transform interface.
    """
    def __init__(self, featureList):
        self.featureList = featureList

    def fit_transform(self, dataFrame):
        # Nothing to fit: selection is stateless, so this is identical to transform().
        return self.transform(dataFrame)

    def transform(self, dataFrame):
        return dataFrame[self.featureList]
class FeatureEngineeringConfiguration:
    """
    Everything we need to know to perform one specific kind of feature
    selection or extraction on a DataSet.
    """
    def __init__(self, description, selectionOrExtraction, method, parameters):
        self.description = description
        self.selectionOrExtraction = selectionOrExtraction
        self.method = method
        self.parameters = parameters

    def __str__(self):
        prefix = self.__class__.__name__ + ' Description: '
        return prefix + self.description
class Transformer:
    """
    Pairs a decomposition or feature_selection object with the original,
    untransformed DataSet that was used to fit it.
    """
    def __init__(self, description, selectionOrExtraction, dataSetUsedToFit, transformingObject):
        self.description = description
        self.selectionOrExtraction = selectionOrExtraction
        self.dataSetUsedToFit = dataSetUsedToFit
        self.transformingObject = transformingObject

    def __str__(self):
        return '{} {} for {}'.format(self.__class__.__name__,
                                     self.description,
                                     self.dataSetUsedToFit)
class ModellingMethod:
    """
    Pairs an sklearn modelling function with a human-readable description
    so it can be referred to easily while processing.
    """
    def __init__(self, description, function):
        self.description = description
        self.function = function

    def __str__(self):
        return ' '.join([self.__class__.__name__, self.description])
class TuneModelConfiguration:
    """
    Everything needed to tune a ModellingMethod over a parameter grid.

    Note: scoreMethod can be a callable object/function or a string ('r2',
    'mean_absolute_error', 'mean_squared_error', or 'median_absolute_error').
    """
    def __init__(self, description, modellingMethod, parameterGrid, scoreMethod):
        self.description = description
        self.modellingMethod = modellingMethod
        self.parameterGrid = parameterGrid
        self.scoreMethod = scoreMethod

    def __str__(self):
        # str(self.scoreMethod): scoreMethod may be a callable (see class docstring),
        # and raw string concatenation would raise TypeError for it.
        return self.__class__.__name__ + ' Description: ' + self.description + '\n' + \
               'Model: ' + str(self.modellingMethod) + '\n' + \
               'Parameter Grid: ' + str(self.parameterGrid) + '\n' + \
               'Scoring Method: ' + str(self.scoreMethod)
class TuneModelResult:
    """
    The outcome of tuneModel(), which contains everything important found in
    tuning parameters for a ModellingMethod.
    """
    def __init__(self, description, dataSet, modellingMethod, parameters, scoreMethod, bestScore, gridScores):
        self.description = description
        self.dataSet = dataSet
        self.modellingMethod = modellingMethod
        self.parameters = parameters
        self.scoreMethod = scoreMethod
        self.bestScore = bestScore
        self.gridScores = gridScores

    def __str__(self):
        # str(self.scoreMethod): scoreMethod may be a callable (same contract as
        # TuneModelConfiguration); raw concatenation would raise TypeError for it.
        return self.__class__.__name__ + ' Description: ' + self.description + '\n' + \
               'Dataset: ' + str(self.dataSet) + '\n' + \
               'Model: ' + str(self.modellingMethod) + '\n' + \
               'Tuned Parameters: ' + str(self.parameters) + '\n' + \
               'Scoring Method: ' + str(self.scoreMethod) + '\n' + \
               'Tuned Training Score: ' + str(self.bestScore) + '\n' + \
               'Grid Scores: ' + str(self.gridScores)
class ApplyModelConfiguration:
    """
    Everything needed to use applyModel(): the modelling method, its parameters,
    the DataSet to train on, and the DataSet to predict for. (Note that if no
    test DataSet is passed in, applyModel() will predict for the training
    DataSet by default.)
    """
    def __init__(self, description, modellingMethod, parameters, trainDataSet, testDataSet=None):
        self.description = description
        self.modellingMethod = modellingMethod
        self.parameters = parameters
        self.trainDataSet = trainDataSet
        # Identity check, not '== None': equality would dispatch to the
        # argument's __eq__ (e.g. DataSet.__eq__) instead of testing for None.
        if testDataSet is None:
            testDataSet = trainDataSet
        self.testDataSet = testDataSet

    def __str__(self):
        return self.__class__.__name__ + ' Description: ' + self.description + '\n' + \
               'Model: ' + str(self.modellingMethod) + '\n' + \
               'Parameters: ' + str(self.parameters) + '\n' + \
               'Training Data Set: ' + str(self.trainDataSet) + '\n' + \
               'Testing Data Set: ' + str(self.testDataSet)
class ApplyModelResult:
    """
    The outcome of applyModel(); this result can subsequently be scored.
    """
    def __init__(self, description, testPredictions, testDataSet, modellingMethod, parameters):
        self.description = description
        self.testPredictions = testPredictions
        self.testDataSet = testDataSet
        self.modellingMethod = modellingMethod
        self.parameters = parameters

    def __str__(self):
        lines = [
            self.__class__.__name__ + ' Description: ' + self.description,
            'Testing Data Set: ' + str(self.testDataSet),
            'Model: ' + str(self.modellingMethod),
            'Parameters: ' + str(self.parameters),
        ]
        return '\n'.join(lines)
class ScoreModelResult:
    """
    The outcome of scoreModel: how a ModellingMethod performed on various ModelScoreMethods.
    """
    def __init__(self, description, modellingMethod, parameters, modelScores):
        self.description = description
        self.modellingMethod = modellingMethod
        self.parameters = parameters
        self.modelScores = modelScores

    def __str__(self):
        header = '\n'.join([
            self.__class__.__name__ + ' Description: ' + self.description,
            'Model: ' + str(self.modellingMethod),
            'Parameters: ' + str(self.parameters),
        ])
        scoreLines = '\n'.join(str(score) for score in self.modelScores)
        return header + '\nModel Scores:\n' + scoreLines + '\n'
class ModelScore:
    """
    Relates a numeric score to the ModelScoreMethod that produced it.
    """
    def __init__(self, score, modelScoreMethod):
        self.score = score
        self.modelScoreMethod = modelScoreMethod

    def __str__(self):
        return '{} Scoring Function: {} Score: {}'.format(
            self.__class__.__name__, self.modelScoreMethod, self.score)
class ModelScoreMethod:
    """
    Pairs an sklearn.metrics scoring function with a description to facilitate processing.
    """
    def __init__(self, description, function):
        self.description = description
        self.function = function

    def __str__(self):
        return ' '.join([self.__class__.__name__, self.description])
class AveragingEnsemble:
    """
    Takes a list of models and averages their predictions for a test dataset
    (optionally as a weighted average), mimicking a generic sklearn regression object.

    Predictor configurations contain the functions and parameters necessary to
    initialize predictors. If no weights are passed in, a regular arithmetic mean
    is calculated. Weights should be a list of the same length as
    predictorConfigurations and in the same order.

    Raises:
        ValueError: if weights are supplied but their count does not match the
            number of predictor configurations.
    """
    def __init__(self, predictorConfigurations, weights=None):
        # Identity check, not '!= None', per standard Python idiom.
        if weights is not None:
            if len(predictorConfigurations) != len(weights):
                # ValueError is still caught by callers handling Exception.
                raise ValueError('Each predictor configuration needs a weight.\n' +
                                 'Number of predictor configurations: ' + str(len(predictorConfigurations)) + '\n' +
                                 'Number of weights: ' + str(len(weights)))
        # Instantiate one predictor per configuration.
        self.predictors = [
            configuration.predictorFunction(**configuration.parameters)
            for configuration in predictorConfigurations
        ]
        self.weights = weights

    def fit(self, X, y):
        """Fit every underlying predictor on the same training data."""
        for predictor in self.predictors:
            predictor.fit(X, y)

    def predict(self, X):
        """Return the (optionally weighted) mean of all predictors' predictions."""
        # Keep the individual predictions around for inspection, as before.
        self.predictions = [predictor.predict(X) for predictor in self.predictors]
        # Average across predictors (axis 0) for each observation;
        # numpy.average with weights=None is a plain arithmetic mean.
        meanPrediction = numpy.average(self.predictions, axis=0, weights=self.weights)
        return meanPrediction

    def __str__(self):
        return self.__class__.__name__ + '\n' + \
               'Predictors: ' + str(self.predictors) + '\n' + \
               'Weights: ' + str(self.weights)
class StackingEnsemble:
    """
    Takes a list of models and stacks their predictions for a dataset, mimicking
    a generic sklearn regression object.

    Predictor configurations contain the functions and parameters necessary to
    initialize predictors. The base predictors' predictions become the feature
    columns ('Base Predictor 1', 'Base Predictor 2', ...) on which the stacking
    predictor is fit; with includeOriginalFeatures=True the original feature
    columns of X are appended as well.
    """
    def __init__(self, basePredictorConfigurations, stackingPredictorConfiguration, includeOriginalFeatures=False):
        self.basePredictors = [
            configuration.predictorFunction(**configuration.parameters)
            for configuration in basePredictorConfigurations
        ]
        self.stackingPredictor = stackingPredictorConfiguration.predictorFunction(**stackingPredictorConfiguration.parameters)
        self.includeOriginalFeatures = includeOriginalFeatures

    def _basePredictionFrame(self, X):
        """Build the meta-feature frame, one column per base predictor.

        Fix: fit() used 'Base Predictor N' column names while predict() used
        str(basePredictor); the names must be identical in both paths or
        estimators that validate feature names fail at predict time.
        """
        basePredictions = pandas.DataFrame()
        for counter, basePredictor in enumerate(self.basePredictors, start=1):
            basePredictions['Base Predictor ' + str(counter)] = basePredictor.predict(X)
        if self.includeOriginalFeatures:
            basePredictions = pandas.concat([basePredictions, X], axis=1)
        return basePredictions

    def fit(self, X, y):
        """Fit the base predictors on (X, y), then the stacking predictor on their predictions."""
        for basePredictor in self.basePredictors:
            basePredictor.fit(X, y)
        self.stackingPredictor.fit(self._basePredictionFrame(X), y)

    def predict(self, X):
        """Predict with each base predictor, then stack via the stacking predictor."""
        return self.stackingPredictor.predict(self._basePredictionFrame(X))

    def __str__(self):
        return self.__class__.__name__ + '\n' + \
               'Stacking predictor: ' + str(self.stackingPredictor) + '\n' + \
               'Base predictors: ' + str(self.basePredictors)
class PredictorConfiguration:
    """
    Bundles a predictor function with a description and the keyword parameters
    used to instantiate it.

    If no parameters are provided, defaults for that predictor function are used.
    """
    def __init__(self, description, predictorFunction, parameters=None):
        self.description = description
        self.predictorFunction = predictorFunction
        # Default to an empty dict: the ensembles expand this with
        # predictorFunction(**parameters), which would crash on None.
        self.parameters = parameters if parameters is not None else {}

    def __str__(self):
        return self.__class__.__name__ + ' ' + self.description + ' ' + str(self.parameters)

    def __repr__(self):
        return self.description + ' ' + str(self.parameters)