-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtypes.py
More file actions
361 lines (290 loc) · 13.7 KB
/
types.py
File metadata and controls
361 lines (290 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
import pandas
import numpy
import copy
class DataSet:
    """
    Everything we need to know about a data set to use it in our pipeline.

    In read mode ('r') it loads a csv from `path` into a pandas DataFrame; in any
    other mode it writes the supplied DataFrame out to `path` as a csv.

    featuresIndex is the positional index of the column at which the features start,
    counting from 0. The default assumption is that the first column is the label
    and the features start at the second column.

    labelIndex is the positional index of the label column; pass None when the
    data set has no label (e.g. an unlabeled test set).
    """
    def __init__(self, description, path, mode='r', dataFrame=None, featuresIndex=1, labelIndex=0):
        self.description = description
        self.path = path
        self.mode = mode
        self.featuresIndex = featuresIndex
        self.labelIndex = labelIndex
        if mode == 'r':
            # Read csv into a pandas frame; any dataFrame argument is ignored here.
            self.dataFrame = pandas.read_csv(self.path)
        else:
            # Deep-copy so later mutations of our frame don't affect the caller's object,
            # then write the frame out as csv.
            self.dataFrame = copy.deepcopy(dataFrame)
            self.dataFrame.to_csv(path, index=False)
        # `.ix` was removed in pandas 1.0; `.iloc` is the purely positional equivalent.
        self.nonFeaturesDataFrame = self.dataFrame.iloc[:, :self.featuresIndex]
        self.featuresDataFrame = self.dataFrame.iloc[:, self.featuresIndex:]
        # As long as there is a label, create the labelSeries
        if labelIndex is not None:
            self.labelSeries = self.dataFrame.iloc[:, labelIndex]

    def __str__(self):
        return self.__class__.__name__ + ' ' + self.description + ', Path: \'' + self.path + '\''

    def __eq__(self, other):
        """
        Compares two data sets on the basis of their paths.
        """
        return (isinstance(other, self.__class__)
                and self.path == other.path)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        # Defining __eq__ disables the default hash; hash by path so equal
        # data sets hash equally and instances stay usable in sets/dicts.
        return hash(self.path)
class SplitDataSet:
    """
    Groups the two halves of a DataSet that has been split into training and testing parts.
    """
    def __init__(self, trainDataSet, testDataSet):
        self.trainDataSet = trainDataSet
        self.testDataSet = testDataSet

    def __str__(self):
        parts = [
            self.__class__.__name__,
            'Training: ' + str(self.trainDataSet),
            'Testing: ' + str(self.testDataSet),
        ]
        return '\n'.join(parts)
class Scaler:
    """
    Pairs a MinMaxScaler object with the original, unscaled DataSet that was used to fit it.
    """
    def __init__(self, dataSetUsedToFit, scalingObject):
        self.dataSetUsedToFit = dataSetUsedToFit
        self.scalingObject = scalingObject

    def __str__(self):
        return '{} for {}'.format(self.__class__.__name__, self.dataSetUsedToFit)
class ExtractSpecificFeatures:
    """
    Transformer that selects a fixed list of named columns from a dataframe,
    mimicking the sklearn fit_transform/transform interface.
    """
    def __init__(self, featureList):
        self.featureList = featureList

    def fit_transform(self, dataFrame):
        # Nothing to fit: selection is stateless, so this is identical to transform().
        return self.transform(dataFrame)

    def transform(self, dataFrame):
        return dataFrame[self.featureList]
class FeatureEngineeringConfiguration:
    """
    Everything we need to know to perform one specific kind of feature
    selection or extraction on a DataSet.
    """
    def __init__(self, description, selectionOrExtraction, method, parameters):
        self.description = description
        self.selectionOrExtraction = selectionOrExtraction
        self.method = method
        self.parameters = parameters

    def __str__(self):
        prefix = self.__class__.__name__ + ' Description: '
        return prefix + self.description
class Transformer:
    """
    Pairs a decomposition or feature_selection object with the original,
    untransformed DataSet that was used to fit it.
    """
    def __init__(self, description, selectionOrExtraction, dataSetUsedToFit, transformingObject):
        self.description = description
        self.selectionOrExtraction = selectionOrExtraction
        self.dataSetUsedToFit = dataSetUsedToFit
        self.transformingObject = transformingObject

    def __str__(self):
        return '{} {} for {}'.format(self.__class__.__name__,
                                     self.description,
                                     self.dataSetUsedToFit)
class ModellingMethod:
    """
    Pairs an sklearn modelling function with a human-readable description
    so it can be referred to easily while processing.
    """
    def __init__(self, description, function):
        self.description = description
        self.function = function

    def __str__(self):
        return ' '.join([self.__class__.__name__, self.description])
class TuneModelConfiguration:
    """
    Everything needed to tune a ModellingMethod over a parameter grid.

    Note: scoreMethod can be a callable object/function or a string ('r2',
    'mean_absolute_error', 'mean_squared_error', or 'median_absolute_error').
    """
    def __init__(self, description, modellingMethod, parameterGrid, scoreMethod):
        self.description = description
        self.modellingMethod = modellingMethod
        self.parameterGrid = parameterGrid
        self.scoreMethod = scoreMethod

    def __str__(self):
        # str(self.scoreMethod): scoreMethod may be a callable (see class docstring),
        # and raw string concatenation would raise TypeError for it.
        return self.__class__.__name__ + ' Description: ' + self.description + '\n' + \
               'Model: ' + str(self.modellingMethod) + '\n' + \
               'Parameter Grid: ' + str(self.parameterGrid) + '\n' + \
               'Scoring Method: ' + str(self.scoreMethod)
class TuneModelResult:
    """
    The outcome of tuneModel(), which contains everything important found in
    tuning parameters for a ModellingMethod.
    """
    def __init__(self, description, dataSet, modellingMethod, parameters, scoreMethod, bestScore, gridScores):
        self.description = description
        self.dataSet = dataSet
        self.modellingMethod = modellingMethod
        self.parameters = parameters
        self.scoreMethod = scoreMethod
        self.bestScore = bestScore
        self.gridScores = gridScores

    def __str__(self):
        # str(self.scoreMethod): scoreMethod may be a callable (same contract as
        # TuneModelConfiguration); raw concatenation would raise TypeError for it.
        return self.__class__.__name__ + ' Description: ' + self.description + '\n' + \
               'Dataset: ' + str(self.dataSet) + '\n' + \
               'Model: ' + str(self.modellingMethod) + '\n' + \
               'Tuned Parameters: ' + str(self.parameters) + '\n' + \
               'Scoring Method: ' + str(self.scoreMethod) + '\n' + \
               'Tuned Training Score: ' + str(self.bestScore) + '\n' + \
               'Grid Scores: ' + str(self.gridScores)
class ApplyModelConfiguration:
    """
    Everything needed to use applyModel(): the modelling method, its parameters,
    the DataSet to train on, and the DataSet to predict for. (Note that if no
    test DataSet is passed in, applyModel() will predict for the training
    DataSet by default.)
    """
    def __init__(self, description, modellingMethod, parameters, trainDataSet, testDataSet=None):
        self.description = description
        self.modellingMethod = modellingMethod
        self.parameters = parameters
        self.trainDataSet = trainDataSet
        # Identity check, not '== None': equality would dispatch to the
        # argument's __eq__ (e.g. DataSet.__eq__) instead of testing for None.
        if testDataSet is None:
            testDataSet = trainDataSet
        self.testDataSet = testDataSet

    def __str__(self):
        return self.__class__.__name__ + ' Description: ' + self.description + '\n' + \
               'Model: ' + str(self.modellingMethod) + '\n' + \
               'Parameters: ' + str(self.parameters) + '\n' + \
               'Training Data Set: ' + str(self.trainDataSet) + '\n' + \
               'Testing Data Set: ' + str(self.testDataSet)
class ApplyModelResult:
    """
    The outcome of applyModel(); this result can subsequently be scored.
    """
    def __init__(self, description, testPredictions, testDataSet, modellingMethod, parameters):
        self.description = description
        self.testPredictions = testPredictions
        self.testDataSet = testDataSet
        self.modellingMethod = modellingMethod
        self.parameters = parameters

    def __str__(self):
        lines = [
            self.__class__.__name__ + ' Description: ' + self.description,
            'Testing Data Set: ' + str(self.testDataSet),
            'Model: ' + str(self.modellingMethod),
            'Parameters: ' + str(self.parameters),
        ]
        return '\n'.join(lines)
class ScoreModelResult:
    """
    The outcome of scoreModel: how a ModellingMethod performed on various ModelScoreMethods.
    """
    def __init__(self, description, modellingMethod, parameters, modelScores):
        self.description = description
        self.modellingMethod = modellingMethod
        self.parameters = parameters
        self.modelScores = modelScores

    def __str__(self):
        header = '\n'.join([
            self.__class__.__name__ + ' Description: ' + self.description,
            'Model: ' + str(self.modellingMethod),
            'Parameters: ' + str(self.parameters),
        ])
        scoreLines = '\n'.join(str(score) for score in self.modelScores)
        return header + '\nModel Scores:\n' + scoreLines + '\n'
class ModelScore:
    """
    Relates a numeric score to the ModelScoreMethod that produced it.
    """
    def __init__(self, score, modelScoreMethod):
        self.score = score
        self.modelScoreMethod = modelScoreMethod

    def __str__(self):
        return '{} Scoring Function: {} Score: {}'.format(
            self.__class__.__name__, self.modelScoreMethod, self.score)
class ModelScoreMethod:
    """
    Pairs an sklearn.metrics scoring function with a description to facilitate processing.
    """
    def __init__(self, description, function):
        self.description = description
        self.function = function

    def __str__(self):
        return ' '.join([self.__class__.__name__, self.description])
class AveragingEnsemble:
    """
    Takes a list of models and averages their predictions for a test dataset
    (optionally as a weighted average), mimicking a generic sklearn regression object.

    Predictor configurations contain the functions and parameters necessary to
    initialize predictors. If no weights are passed in, a regular arithmetic mean
    is calculated. Weights should be a list of the same length as
    predictorConfigurations and in the same order.

    Raises:
        ValueError: if weights are supplied but their count does not match the
            number of predictor configurations.
    """
    def __init__(self, predictorConfigurations, weights=None):
        # Identity check, not '!= None', per standard Python idiom.
        if weights is not None:
            if len(predictorConfigurations) != len(weights):
                # ValueError is still caught by callers handling Exception.
                raise ValueError('Each predictor configuration needs a weight.\n' +
                                 'Number of predictor configurations: ' + str(len(predictorConfigurations)) + '\n' +
                                 'Number of weights: ' + str(len(weights)))
        # Instantiate one predictor per configuration.
        self.predictors = [
            configuration.predictorFunction(**configuration.parameters)
            for configuration in predictorConfigurations
        ]
        self.weights = weights

    def fit(self, X, y):
        """Fit every underlying predictor on the same training data."""
        for predictor in self.predictors:
            predictor.fit(X, y)

    def predict(self, X):
        """Return the (optionally weighted) mean of all predictors' predictions."""
        # Keep the individual predictions around for inspection, as before.
        self.predictions = [predictor.predict(X) for predictor in self.predictors]
        # Average across predictors (axis 0) for each observation;
        # numpy.average with weights=None is a plain arithmetic mean.
        meanPrediction = numpy.average(self.predictions, axis=0, weights=self.weights)
        return meanPrediction

    def __str__(self):
        return self.__class__.__name__ + '\n' + \
               'Predictors: ' + str(self.predictors) + '\n' + \
               'Weights: ' + str(self.weights)
class StackingEnsemble:
    """
    Takes a list of models and stacks their predictions for a dataset, mimicking
    a generic sklearn regression object.

    Predictor configurations contain the functions and parameters necessary to
    initialize predictors. The base predictors' predictions become the feature
    columns ('Base Predictor 1', 'Base Predictor 2', ...) on which the stacking
    predictor is fit; with includeOriginalFeatures=True the original feature
    columns of X are appended as well.
    """
    def __init__(self, basePredictorConfigurations, stackingPredictorConfiguration, includeOriginalFeatures=False):
        self.basePredictors = [
            configuration.predictorFunction(**configuration.parameters)
            for configuration in basePredictorConfigurations
        ]
        self.stackingPredictor = stackingPredictorConfiguration.predictorFunction(**stackingPredictorConfiguration.parameters)
        self.includeOriginalFeatures = includeOriginalFeatures

    def _basePredictionFrame(self, X):
        """Build the meta-feature frame, one column per base predictor.

        Fix: fit() used 'Base Predictor N' column names while predict() used
        str(basePredictor); the names must be identical in both paths or
        estimators that validate feature names fail at predict time.
        """
        basePredictions = pandas.DataFrame()
        for counter, basePredictor in enumerate(self.basePredictors, start=1):
            basePredictions['Base Predictor ' + str(counter)] = basePredictor.predict(X)
        if self.includeOriginalFeatures:
            basePredictions = pandas.concat([basePredictions, X], axis=1)
        return basePredictions

    def fit(self, X, y):
        """Fit the base predictors on (X, y), then the stacking predictor on their predictions."""
        for basePredictor in self.basePredictors:
            basePredictor.fit(X, y)
        self.stackingPredictor.fit(self._basePredictionFrame(X), y)

    def predict(self, X):
        """Predict with each base predictor, then stack via the stacking predictor."""
        return self.stackingPredictor.predict(self._basePredictionFrame(X))

    def __str__(self):
        return self.__class__.__name__ + '\n' + \
               'Stacking predictor: ' + str(self.stackingPredictor) + '\n' + \
               'Base predictors: ' + str(self.basePredictors)
class PredictorConfiguration:
    """
    Bundles a predictor function with a description and the keyword parameters
    used to instantiate it.

    If no parameters are provided, defaults for that predictor function are used.
    """
    def __init__(self, description, predictorFunction, parameters=None):
        self.description = description
        self.predictorFunction = predictorFunction
        # Default to an empty dict: the ensembles expand this with
        # predictorFunction(**parameters), which would crash on None.
        self.parameters = parameters if parameters is not None else {}

    def __str__(self):
        return self.__class__.__name__ + ' ' + self.description + ' ' + str(self.parameters)

    def __repr__(self):
        return self.description + ' ' + str(self.parameters)