from xai_components.base import InArg, OutArg, Component, xai_component
from IPython.utils import capture


@xai_component(color="blue")
class SetupClassification(Component):
    """
    Initializes the training environment and creates the transformation pipeline.
    Setup must be called before executing any other component.

    ##### inPorts:
    - in_dataset (any): Shape (n_samples, n_features), where n_samples is the number of samples and n_features is the number of features.
    - target (str): Name of the target column, passed as a string. The target variable can be either binary or multiclass.
    - train_size_fraction (float): Proportion of the dataset to be used for training and validation. Should be between 0.0 and 1.0.
    - normalize (bool): When set to True, transforms the numeric features by scaling them to a given range.
    - transformation (bool): When set to True, applies a power transform to make the data more Gaussian-like.
    - remove_multicollinearity (bool): When set to True, features with inter-correlations higher than the defined threshold are removed.
    - multicollinearity_threshold (float): Threshold for correlated features. Ignored when remove_multicollinearity is not True.
    - bin_numeric_features (any): Converts numeric features into categorical ones. It takes a list of strings with the column names to be binned.
    - group_features (any): When the dataset contains features with related characteristics, group_features can be used for feature extraction. It takes a dictionary mapping each group name to a list of related column names (see the sketch after this class).
    - ignore_features (list): Features to be ignored during model training. It takes a list of strings with the column names to be ignored.
    - seed (int): Random state, for reproducibility.
    - log_experiment (bool): When set to True, logs the setup and training runs.
    - experiment_name (str): Name of the experiment for logging.
    - use_gpu (bool): Whether to use the GPU for training.
    """
    in_dataset: InArg[any]
    target: InArg[str]
    train_size_fraction: InArg[float]
    normalize: InArg[bool]
    transformation: InArg[bool]
    remove_multicollinearity: InArg[bool]
    multicollinearity_threshold: InArg[float]
    bin_numeric_features: InArg[any]
    group_features: InArg[any]
    ignore_features: InArg[list]
    seed: InArg[int]
    log_experiment: InArg[bool]
    experiment_name: InArg[str]
    use_gpu: InArg[bool]

    def __init__(self):
        super().__init__()
        self.target.value = 'default'
        self.train_size_fraction.value = 1.0
        self.normalize.value = False
        self.transformation.value = False
        self.remove_multicollinearity.value = False
        self.multicollinearity_threshold.value = 0.9
        self.experiment_name.value = 'default'
        self.use_gpu.value = False

    def execute(self, ctx) -> None:
        from pycaret.classification import setup, models
        import warnings

        # Validate and reformat group_features if necessary.
        if isinstance(self.group_features.value, dict):
            for key, group in self.group_features.value.items():
                if isinstance(group, list):
                    # Ensure all items in the group list are strings.
                    self.group_features.value[key] = [str(g) for g in group]
                else:
                    warnings.warn(f"Invalid format in group '{key}'; setting it to an empty list.")
                    self.group_features.value[key] = []
        elif self.group_features.value is not None:
            warnings.warn("`group_features` should be a dictionary with string keys and list values. Setting to None.")
            self.group_features.value = None

        if self.seed.value is None:
            print("Set the seed value for reproducibility.")

        with capture.capture_output() as captured:
            setup_pycaret = setup(
                data=self.in_dataset.value,
                target=self.target.value,
                train_size=self.train_size_fraction.value,
                normalize=self.normalize.value,
                transformation=self.transformation.value,
                remove_multicollinearity=self.remove_multicollinearity.value,
                multicollinearity_threshold=self.multicollinearity_threshold.value,
                bin_numeric_features=self.bin_numeric_features.value,
                group_features=self.group_features.value,
                ignore_features=self.ignore_features.value,
                session_id=self.seed.value,
                log_experiment=self.log_experiment.value,
                experiment_name=self.experiment_name.value,
                use_gpu=self.use_gpu.value,
                n_jobs=1,
            )
        captured.show()

        print("List of the available classification models:")
        print(models())
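
# Illustrative shapes for the structured ports validated above; the column
# names here are placeholders for whatever the input DataFrame actually
# contains, not part of this library:
#
#   setup_step = SetupClassification()
#   setup_step.bin_numeric_features.value = ['age', 'income']
#   setup_step.group_features.value = {'balances': ['bal_q1', 'bal_q2', 'bal_q3']}
#   setup_step.ignore_features.value = ['customer_id']
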
@xai_component(color="firebrick")
class CompareModelsClassification(Component):
"""
Trains and evaluates performance of all estimators available in the model library using cross validation.
The output of this component is a score grid with average cross-validated scores.
##### inPorts:
- sort_by (str): The sort order of the score grid.
- exclude (list): To omit certain models from training and evaluation, pass a list containing model id in the exclude parameter.
- num_top (int): Number of top_n models to return.
##### outPorts:
- top_models (any): List of top models.
"""
sort_by: InArg[str]
exclude: InArg[list]
num_top: InArg[int]
top_models: OutArg[any]
def __init__(self):
super().__init__()
self.sort_by.value = 'Accuracy'
self.num_top.value = 1
def execute(self, ctx) -> None:
from pycaret.classification import compare_models
with capture.capture_output() as captured:
best_model = compare_models(
sort=self.sort_by.value,
exclude=self.exclude.value,
n_select=self.num_top.value
)
captured.show()
print('Best ' + str(self.num_top.value) + ' Model:', best_model)
self.top_models.value = best_model
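
# Illustrative port values, assuming PyCaret's built-in estimator IDs
# (e.g. 'lr', 'dt', 'rf', 'svm', 'rbfsvm'); check the models() listing
# printed by SetupClassification for the IDs available in your install:
#
#   compare_step = CompareModelsClassification()
#   compare_step.sort_by.value = 'AUC'
#   compare_step.exclude.value = ['svm', 'rbfsvm']  # skip the slower kernel models
#   compare_step.num_top.value = 3
#   compare_step.execute(ctx={})                    # top_models then holds a list of 3 models
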
@xai_component(color="orange")
class CreateModelClassification(Component):
"""
Trains and evaluates the performance of a given estimator using cross validation.
The output of this component is a score grid with CV scores by fold.
##### inPorts:
- model_id (str): ID of an estimator available in the model library or pass an untrained model object consistent with scikit-learn API.
- num_fold (int): Controls cross-validation. If None, the CV generator in the fold_strategy parameter of the setup function is used.
##### outPorts:
- out_created_model (any): Trained Model object.
"""
model_id: InArg[str]
num_fold: InArg[int]
out_created_model: OutArg[any]
def __init__(self):
super().__init__()
self.model_id.value = 'lr'
self.num_fold.value = 10
def execute(self, ctx) -> None:
from pycaret.classification import create_model
with capture.capture_output() as captured:
created_model = create_model(
estimator=self.model_id.value,
fold=self.num_fold.value
)
captured.show()
print(created_model)
self.out_created_model.value = created_model
@xai_component(color="salmon")
class TuneModelClassification(Component):
"""
Tunes the hyperparameters of a given model. The output of this component is
a score grid with CV scores by fold of the best selected model based on optimize parameter.
##### inPorts:
- in_model (any): Trained model object.
- optimize (str): Metric name to be evaluated for hyperparameter tuning.
- early_stopping_patience (int): Maximum number of epochs to run for each sampled configuration.
- num_fold (int): Controls cross-validation. If None, the CV generator in the fold_strategy parameter of the setup function is used.
- n_iter (int): Number of iterations in the grid search. Increasing ‘n_iter’ may improve model performance but also increases the training time.
- custom_grid (any): To define custom search space for hyperparameters, pass a dictionary with parameter name and values to be iterated.
##### outPorts:
- out_tuned_model (any): Tuned model object.
"""
in_model: InArg[any]
optimize: InArg[str]
early_stopping_patience: InArg[int]
num_fold: InArg[int]
n_iter: InArg[int]
custom_grid: InArg[any]
out_tuned_model: OutArg[any]
def __init__(self):
super().__init__()
self.optimize.value = "Accuracy"
self.num_fold.value = 10
self.n_iter.value = 10
def execute(self, ctx) -> None:
from pycaret.classification import tune_model
from IPython.display import display
early_stopping = self.early_stopping_patience.value is not None
with capture.capture_output() as captured:
tuned_model = tune_model(
estimator=self.in_model.value,
optimize=self.optimize.value,
fold=self.num_fold.value,
n_iter=self.n_iter.value,
early_stopping=early_stopping,
early_stopping_max_iters=self.early_stopping_patience.value if early_stopping else 10,
custom_grid=self.custom_grid.value
)
for o in captured.outputs:
display(o)
self.out_tuned_model.value = tuned_model
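
# A hedged example of a custom search space, assuming the model being tuned
# is scikit-learn's LogisticRegression (PyCaret ID 'lr'), whose 'C' and
# 'max_iter' parameters exist; adjust the keys to whatever estimator you tune:
#
#   tune_step = TuneModelClassification()
#   tune_step.in_model.value = created_model  # e.g. from CreateModelClassification
#   tune_step.custom_grid.value = {'C': [0.01, 0.1, 1.0, 10.0], 'max_iter': [100, 500]}
#   tune_step.n_iter.value = 8
#   tune_step.execute(ctx={})
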
@xai_component(color="springgreen")
class PlotModelClassification(Component):
"""
Analyzes the performance of a trained model on holdout set. It may require re-training the model in certain cases.
##### inPorts:
- in_model (any): Trained model object.
- plot_type (str): Plot name.
- list_available_plots (bool): List the available plots.
##### outPorts:
- out_model (any): Trained model object.
"""
in_model: InArg[any]
plot_type: InArg[str]
list_available_plots: InArg[bool]
out_model: OutArg[any]
def __init__(self):
super().__init__()
self.plot_type.value = 'auc'
self.list_available_plots.value = False
def execute(self, ctx) -> None:
from pycaret.classification import plot_model
plot = {
'auc': 'Area Under the Curve', 'threshold': 'Discrimination Threshold', 'pr': 'Precision Recall Curve',
'confusion_matrix': 'Confusion Matrix', 'error': 'Class Prediction Error', 'class_report': 'Classification Report',
'boundary': 'Decision Boundary', 'rfe': 'Recursive Feature Selection', 'learning': 'Learning Curve',
'manifold': 'Manifold Learning', 'calibration': 'Calibration Curve', 'vc': 'Validation Curve',
'dimension': 'Dimension Learning', 'feature': 'Feature Importance', 'feature_all': 'Feature Importance (All)',
'parameter': 'Model Hyperparameter', 'lift': 'Lift Curve', 'gain': 'Gain Chart', 'tree': 'Decision Tree', 'ks': 'KS Statistic Plot'
}
with capture.capture_output() as captured:
plot_model(self.in_model.value, plot=self.plot_type.value)
captured.show()
if self.list_available_plots.value:
print('List of available plots (plot Type - Plot Name):')
for key, value in plot.items():
print(key, ' - ', value)
self.out_model.value = self.in_model.value
@xai_component(color='crimson')
class FinalizeModelClassification(Component):
    """
    Trains a given estimator on the entire dataset including the holdout set.

    ##### inPorts:
    - in_model (any): Trained model object.

    ##### outPorts:
    - out_finalize_model (any): Trained model object.
    """
    in_model: InArg[any]
    out_finalize_model: OutArg[any]

    def execute(self, ctx) -> None:
        from pycaret.classification import finalize_model

        print(self.in_model.value)
        with capture.capture_output() as captured:
            out_finalize_model = finalize_model(self.in_model.value)
            print(out_finalize_model)
        captured.show()

        self.out_finalize_model.value = out_finalize_model
@xai_component(color='darkviolet')
class PredictModelClassification(Component):
    """
    Predicts the label and score (probability of the predicted class) using a trained model.
    When predict_dataset is None, it predicts the label and score on the holdout set.

    ##### inPorts:
    - in_model (any): Trained model object.
    - predict_dataset (any): Shape (n_samples, n_features). All features used during training must be available in the unseen dataset.

    ##### outPorts:
    - prediction_results (any): pandas.DataFrame with prediction and score columns.
    - out_model (any): Trained model object (passed through unchanged).
    """
    in_model: InArg[any]
    predict_dataset: InArg[any]
    prediction_results: OutArg[any]
    out_model: OutArg[any]

    def execute(self, ctx) -> None:
        from pycaret.classification import predict_model

        with capture.capture_output() as captured:
            prediction = predict_model(self.in_model.value, data=self.predict_dataset.value)
        captured.show()

        self.prediction_results.value = prediction
        self.out_model.value = self.in_model.value
@xai_component(color='red')
class SaveModelClassification(Component):
    """
    Saves the transformation pipeline and trained model object to the given path as a pickle file for later use.

    ##### inPorts:
    - in_model (any): Trained model object.
    - save_path (str): Name and saving path of the model, without the .pkl extension.
    - model_only (bool): When set to True, only the trained model object is saved instead of the entire pipeline.
    """
    in_model: InArg[any]
    save_path: InArg[str]
    model_only: InArg[bool]

    def execute(self, ctx) -> None:
        from pycaret.classification import save_model

        save_model(self.in_model.value, model_name=self.save_path.value, model_only=self.model_only.value)
@xai_component(color='red')
class LoadModelClassification(Component):
    """
    Loads a previously saved pipeline (see the round-trip sketch after this class).

    ##### inPorts:
    - model_path (str): Name and path of the saved model, without the .pkl extension.

    ##### outPorts:
    - model (any): Trained model object.
    """
    model_path: InArg[str]
    model: OutArg[any]

    def execute(self, ctx) -> None:
        from pycaret.classification import load_model

        self.model.value = load_model(model_name=self.model_path.value)
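
# Save/load round trip, assuming write access to the target directory;
# PyCaret appends the '.pkl' extension itself, so pass the path without it.
# The path and model names below are illustrative:
#
#   saver = SaveModelClassification()
#   saver.in_model.value = final_model          # e.g. from FinalizeModelClassification
#   saver.save_path.value = 'models/credit_lr'  # directory must already exist
#   saver.execute(ctx={})
#
#   loader = LoadModelClassification()
#   loader.model_path.value = 'models/credit_lr'
#   loader.execute(ctx={})                      # loader.model now holds the pipeline
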
@xai_component(color='gold')
class EnsembleModelClassification(Component):
    """
    Ensembles a given estimator. The output of this component is a score grid with CV scores by fold.

    ##### inPorts:
    - in_model (any): Trained model object.
    - method (str): Method for ensembling the base estimator. It can be 'Bagging' or 'Boosting'.
    - choose_better (bool): When set to True, the better-performing of the input and ensembled models is returned. The metric used for comparison is defined by the optimize parameter.
    - optimize (str): Metric to compare for model selection when choose_better is True.
    - n_estimators (int): The number of base estimators in the ensemble. In case of a perfect fit, the learning procedure is stopped early.

    ##### outPorts:
    - out_ensemble_model (any): Ensembled model object.
    """
    in_model: InArg[any]
    method: InArg[str]
    choose_better: InArg[bool]
    optimize: InArg[str]
    n_estimators: InArg[int]
    out_ensemble_model: OutArg[any]

    def __init__(self):
        super().__init__()
        self.method.value = 'Bagging'
        self.choose_better.value = False
        self.optimize.value = 'Accuracy'
        self.n_estimators.value = 10

    def execute(self, ctx) -> None:
        from pycaret.classification import ensemble_model

        with capture.capture_output() as captured:
            ensembled_model = ensemble_model(
                estimator=self.in_model.value,
                method=self.method.value,
                choose_better=self.choose_better.value,
                optimize=self.optimize.value,
                n_estimators=self.n_estimators.value
            )
        captured.show()

        print('Ensemble model:', ensembled_model)
        self.out_ensemble_model.value = ensembled_model
@xai_component(color='greenyellow')
class BlendModelsClassification(Component):
    """
    Trains a Soft Voting / Majority Rule classifier for the selected models.
    Pass either the top_models list or up to three individual models (see the sketch after this class).

    ##### inPorts:
    - top_models (any): List of trained model objects from the CompareModelsClassification component.
    - model_1 (any): First model to blend.
    - model_2 (any): Second model to blend.
    - model_3 (any): Third model to blend.
    - method (str): 'hard' uses predicted class labels for majority rule voting. 'soft' predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers. The default, 'auto', tries 'soft' and falls back to 'hard' if the former is not supported.
    - choose_better (bool): When set to True, the better-performing of the input and blended models is returned. The metric used for comparison is defined by the optimize parameter.
    - optimize (str): Metric to compare for model selection when choose_better is True.

    ##### outPorts:
    - out_blended_model (any): Blended model object.
    """
    top_models: InArg[any]
    model_1: InArg[any]
    model_2: InArg[any]
    model_3: InArg[any]
    method: InArg[str]
    choose_better: InArg[bool]
    optimize: InArg[str]
    out_blended_model: OutArg[any]

    def __init__(self):
        super().__init__()
        self.method.value = 'auto'
        self.choose_better.value = False
        self.optimize.value = 'Accuracy'

    def execute(self, ctx) -> None:
        from pycaret.classification import blend_models

        model_list = self.top_models.value
        # Fall back to the individually wired models when no list is provided.
        if model_list is None:
            candidates = [self.model_1.value, self.model_2.value, self.model_3.value]
            model_list = [m for m in candidates if m is not None]

        with capture.capture_output() as captured:
            blended_model = blend_models(
                estimator_list=model_list,
                method=self.method.value,
                choose_better=self.choose_better.value,
                optimize=self.optimize.value
            )
        captured.show()

        self.out_blended_model.value = blended_model
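
# Two illustrative ways to feed this component: pass the list produced by
# CompareModelsClassification (with num_top >= 2), or wire up to three
# individual models; the model names below are placeholders:
#
#   blender = BlendModelsClassification()
#   blender.top_models.value = compare_step.top_models.value
#   # ...or, wiring individual models instead:
#   blender.model_1.value = lr_model
#   blender.model_2.value = rf_model
#   blender.method.value = 'soft'
#   blender.execute(ctx={})
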
@xai_component(color='lawngreen')
class StackModelsClassification(Component):
    """
    Trains a meta model over the selected estimators.
    The output of this component is a score grid with CV scores by fold.

    ##### inPorts:
    - top_models (any): List of trained model objects from the CompareModelsClassification component.
    - model_1 (any): First model to stack.
    - model_2 (any): Second model to stack.
    - model_3 (any): Third model to stack.
    - meta_model (any): When None, a Logistic Regression is trained as the meta model.
    - method (str): When set to 'auto', it invokes, for each estimator, 'predict_proba', 'decision_function' or 'predict' in that order.
    - choose_better (bool): When set to True, the better-performing of the input and stacked models is returned. The metric used for comparison is defined by the optimize parameter.
    - optimize (str): Metric to compare for model selection when choose_better is True.

    ##### outPorts:
    - out_stacked_model (any): Stacked model object.
    """
    top_models: InArg[any]
    model_1: InArg[any]
    model_2: InArg[any]
    model_3: InArg[any]
    meta_model: InArg[any]
    method: InArg[str]
    choose_better: InArg[bool]
    optimize: InArg[str]
    out_stacked_model: OutArg[any]

    def __init__(self):
        super().__init__()
        self.method.value = 'auto'
        self.choose_better.value = False
        self.optimize.value = 'Accuracy'

    def execute(self, ctx) -> None:
        from pycaret.classification import stack_models

        model_list = self.top_models.value
        # Fall back to the individually wired models when no list is provided.
        if model_list is None:
            candidates = [self.model_1.value, self.model_2.value, self.model_3.value]
            model_list = [m for m in candidates if m is not None]

        with capture.capture_output() as captured:
            stacked_model = stack_models(
                estimator_list=model_list,
                meta_model=self.meta_model.value,
                method=self.method.value,
                choose_better=self.choose_better.value,
                optimize=self.optimize.value
            )
        captured.show()

        print('Stacked models:', stacked_model.estimators_)
        self.out_stacked_model.value = stacked_model
@xai_component(color='steelblue')
class CalibrateModelClassification(Component):
    """
    Calibrates the probabilities of a given estimator using isotonic or logistic regression.
    The output of this component is a score grid with CV scores by fold.

    ##### inPorts:
    - in_model (any): Trained model object.
    - method (str): The method to use for calibration. Can be 'sigmoid', which corresponds to Platt's method, or 'isotonic', which is a non-parametric approach.
    - calibrate_fold (int): Controls internal cross-validation. Can be an integer or a scikit-learn CV generator.

    ##### outPorts:
    - out_calibrate_model (any): Calibrated model object.
    """
    in_model: InArg[any]
    method: InArg[str]
    calibrate_fold: InArg[int]
    out_calibrate_model: OutArg[any]

    def __init__(self):
        super().__init__()
        self.method.value = 'sigmoid'
        self.calibrate_fold.value = 5

    def execute(self, ctx) -> None:
        from pycaret.classification import calibrate_model

        with capture.capture_output() as captured:
            calibrated_model = calibrate_model(
                estimator=self.in_model.value,
                method=self.method.value,
                calibrate_fold=self.calibrate_fold.value
            )
        captured.show()

        self.out_calibrate_model.value = calibrated_model
@xai_component
class AutoMLClassification(Component):
    """
    Returns the best model out of all trained models in the current session based on the optimize parameter.
    Metrics evaluated can be accessed using the get_metrics function.

    ##### inPorts:
    - optimize (str): Metric to use for model selection. It also accepts custom metrics added using the add_metric function.

    ##### outPorts:
    - best_model (any): Best trained model object.
    """
    optimize: InArg[str]
    best_model: OutArg[any]

    def __init__(self):
        super().__init__()
        self.optimize.value = 'Accuracy'

    def execute(self, ctx) -> None:
        from pycaret.classification import automl

        self.best_model.value = automl(optimize=self.optimize.value)
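
# End-to-end sketch of a typical flow, mirroring how Xircuits would chain
# these components; the dataset file, target column, and save path are
# assumptions, not part of this library:
#
#   import pandas as pd
#   data = pd.read_csv('credit.csv')  # illustrative dataset
#
#   setup_step = SetupClassification()
#   setup_step.in_dataset.value = data
#   setup_step.target.value = 'default'
#   setup_step.train_size_fraction.value = 0.8
#   setup_step.seed.value = 123
#   setup_step.execute(ctx={})
#
#   compare_step = CompareModelsClassification()
#   compare_step.num_top.value = 1
#   compare_step.execute(ctx={})
#
#   finalize_step = FinalizeModelClassification()
#   finalize_step.in_model.value = compare_step.top_models.value
#   finalize_step.execute(ctx={})
#
#   save_step = SaveModelClassification()
#   save_step.in_model.value = finalize_step.out_finalize_model.value
#   save_step.save_path.value = 'credit_best_model'
#   save_step.execute(ctx={})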