The SuperML R package is designed to unify the model training process in R, much as scikit-learn does in Python. People generally spend a lot of time searching for packages and figuring out the syntax for training machine learning models in R; this is especially apparent in users who frequently switch between R and Python. This package provides a scikit-learn-like interface (fit, predict) to train models faster.
In addition to building machine learning models, it offers handy functionalities for feature engineering.
This ambitious package is my ongoing effort to help the R community build ML models easily and faster in R.
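Every trainer in superml follows the same scikit-learn style workflow: initialise a trainer with $new(), train it with $fit(), and score new data with $predict(). As a quick sketch of that pattern (here a random forest on the built-in iris data, predicting back on the training data purely for illustration):
library(superml)
data(iris)
# every trainer exposes $new(), $fit() and $predict()
rf <- RFTrainer$new(n_estimators = 50, classification = 1)
rf$fit(X = iris, y = "Species")
pred <- rf$predict(df = iris)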
You can install the latest CRAN version (recommended) using:
install.packages("superml")
You can install the development version directly from GitHub using:
devtools::install_github("saraswatmks/superml")
This package builds on existing R packages for its machine learning models. In this tutorial, we'll use the data.table package for all data manipulation tasks.
We'll quickly prepare the data set so it's ready for model training.
load("../data/reg_train.rda")
# if the above doesn't work, you can try: load("reg_train.rda")
library(data.table)
library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(superml)
#> Loading required package: R6
library(Metrics)
#>
#> Attaching package: 'Metrics'
#> The following objects are masked from 'package:caret':
#>
#> precision, recall
head(reg_train)
#> Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
#> 1: 1 60 RL 65 8450 Pave <NA> Reg
#> 2: 2 20 RL 80 9600 Pave <NA> Reg
#> 3: 3 60 RL 68 11250 Pave <NA> IR1
#> 4: 4 70 RL 60 9550 Pave <NA> IR1
#> 5: 5 60 RL 84 14260 Pave <NA> IR1
#> 6: 6 50 RL 85 14115 Pave <NA> IR1
#> LandContour Utilities LotConfig LandSlope Neighborhood Condition1
#> 1: Lvl AllPub Inside Gtl CollgCr Norm
#> 2: Lvl AllPub FR2 Gtl Veenker Feedr
#> 3: Lvl AllPub Inside Gtl CollgCr Norm
#> 4: Lvl AllPub Corner Gtl Crawfor Norm
#> 5: Lvl AllPub FR2 Gtl NoRidge Norm
#> 6: Lvl AllPub Inside Gtl Mitchel Norm
#> Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
#> 1: Norm 1Fam 2Story 7 5 2003
#> 2: Norm 1Fam 1Story 6 8 1976
#> 3: Norm 1Fam 2Story 7 5 2001
#> 4: Norm 1Fam 2Story 7 5 1915
#> 5: Norm 1Fam 2Story 8 5 2000
#> 6: Norm 1Fam 1.5Fin 5 5 1993
#> YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
#> 1: 2003 Gable CompShg VinylSd VinylSd BrkFace
#> 2: 1976 Gable CompShg MetalSd MetalSd None
#> 3: 2002 Gable CompShg VinylSd VinylSd BrkFace
#> 4: 1970 Gable CompShg Wd Sdng Wd Shng None
#> 5: 2000 Gable CompShg VinylSd VinylSd BrkFace
#> 6: 1995 Gable CompShg VinylSd VinylSd None
#> MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
#> 1: 196 Gd TA PConc Gd TA
#> 2: 0 TA TA CBlock Gd TA
#> 3: 162 Gd TA PConc Gd TA
#> 4: 0 TA TA BrkTil TA Gd
#> 5: 350 Gd TA PConc Gd TA
#> 6: 0 TA TA Wood Gd TA
#> BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
#> 1: No GLQ 706 Unf 0 150
#> 2: Gd ALQ 978 Unf 0 284
#> 3: Mn GLQ 486 Unf 0 434
#> 4: No ALQ 216 Unf 0 540
#> 5: Av GLQ 655 Unf 0 490
#> 6: No GLQ 732 Unf 0 64
#> TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF
#> 1: 856 GasA Ex Y SBrkr 856 854
#> 2: 1262 GasA Ex Y SBrkr 1262 0
#> 3: 920 GasA Ex Y SBrkr 920 866
#> 4: 756 GasA Gd Y SBrkr 961 756
#> 5: 1145 GasA Ex Y SBrkr 1145 1053
#> 6: 796 GasA Ex Y SBrkr 796 566
#> LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
#> 1: 0 1710 1 0 2 1
#> 2: 0 1262 0 1 2 0
#> 3: 0 1786 1 0 2 1
#> 4: 0 1717 1 0 1 0
#> 5: 0 2198 1 0 2 1
#> 6: 0 1362 1 0 1 1
#> BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
#> 1: 3 1 Gd 8 Typ
#> 2: 3 1 TA 6 Typ
#> 3: 3 1 Gd 6 Typ
#> 4: 3 1 Gd 7 Typ
#> 5: 4 1 Gd 9 Typ
#> 6: 1 1 TA 5 Typ
#> Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
#> 1: 0 <NA> Attchd 2003 RFn 2
#> 2: 1 TA Attchd 1976 RFn 2
#> 3: 1 TA Attchd 2001 RFn 2
#> 4: 1 Gd Detchd 1998 Unf 3
#> 5: 1 TA Attchd 2000 RFn 3
#> 6: 0 <NA> Attchd 1993 Unf 2
#> GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
#> 1: 548 TA TA Y 0 61
#> 2: 460 TA TA Y 298 0
#> 3: 608 TA TA Y 0 42
#> 4: 642 TA TA Y 0 35
#> 5: 836 TA TA Y 192 84
#> 6: 480 TA TA Y 40 30
#> EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
#> 1: 0 0 0 0 <NA> <NA> <NA>
#> 2: 0 0 0 0 <NA> <NA> <NA>
#> 3: 0 0 0 0 <NA> <NA> <NA>
#> 4: 272 0 0 0 <NA> <NA> <NA>
#> 5: 0 0 0 0 <NA> <NA> <NA>
#> 6: 0 320 0 0 <NA> MnPrv Shed
#> MiscVal MoSold YrSold SaleType SaleCondition SalePrice
#> 1: 0 2 2008 WD Normal 208500
#> 2: 0 5 2007 WD Normal 181500
#> 3: 0 9 2008 WD Normal 223500
#> 4: 0 2 2006 WD Abnorml 140000
#> 5: 0 12 2008 WD Normal 250000
#> 6: 700 10 2009 WD Normal 143000
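# split the data into train (70%) and test (30%) sets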
split <- createDataPartition(y = reg_train$SalePrice, p = 0.7)
xtrain <- reg_train[split$Resample1]
xtest <- reg_train[!split$Resample1]
# remove features with more than 90% missing values
# we will also remove the Id column because it doesn't contain
# any useful information
na_cols <- colSums(is.na(xtrain)) / nrow(xtrain)
na_cols <- names(na_cols[which(na_cols > 0.9)])
xtrain[, c(na_cols, "Id") := NULL]
xtest[, c(na_cols, "Id") := NULL]
# encode categorical variables
cat_cols <- names(xtrain)[sapply(xtrain, is.character)]
for (col in cat_cols) {
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[col]], xtest[[col]]))
    xtrain[[col]] <- lbl$transform(xtrain[[col]])
    xtest[[col]] <- lbl$transform(xtest[[col]])
}
#> The data contains NA values. Imputing NA with 'NA'
#> (message repeated once for each encoded categorical column)
# remove noisy columns
noise <- c('GrLivArea','TotalBsmtSF')
xtrain[, c(noise) := NULL]
xtest[, c(noise) := NULL]
# fill missing values with -1
xtrain[is.na(xtrain)] <- -1
xtest[is.na(xtest)] <- -1
KNN Regression
knn <- KNNTrainer$new(k = 2, prob = TRUE, type = 'reg')
# note: KNNTrainer takes the test set at fit time; predict() then
# returns predictions for that test set
knn$fit(train = xtrain, test = xtest, y = 'SalePrice')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
rmse(actual = xtest$SalePrice, predicted = labels)
#> [1] 4799.556
SVM Regression
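# SVMTrainer is built on e1071::svm (see the warning below); it infers the
# task from the target type: numeric -> regression, factor -> classification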
svm <- SVMTrainer$new()
#> [1] "For classification, target variable must be factor type. For regression, target variable must be numeric type."
svm$fit(xtrain, 'SalePrice')
#> Warning in svm.default(x = dataX, y = X[[y]], type = self$type, kernel =
#> self$kernel): Variable(s) 'Utilities' constant. Cannot scale data.
pred <- svm$predict(xtest)
rmse(actual = xtest$SalePrice, predicted = pred)
#> [1] 82964.99
Simple Regression
lf <- LMTrainer$new(family = "gaussian")
lf$fit(X = xtrain, y = "SalePrice")
summary(lf$model)
#>
#> Call:
#> stats::glm(formula = f, family = self$family, data = X, weights = self$weights)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -353897 -13524 -994 13091 200214
#>
#> Coefficients: (1 not defined because of singularities)
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.221e+06 1.460e+06 -1.522 0.128386
#> MSSubClass -7.534e+01 4.926e+01 -1.529 0.126512
#> MSZoning -1.421e+03 1.347e+03 -1.055 0.291704
#> LotFrontage 4.875e+01 3.091e+01 1.577 0.115140
#> LotArea 3.636e-01 1.305e-01 2.786 0.005441 **
#> Street -7.383e+03 2.132e+04 -0.346 0.729165
#> LotShape 4.074e+03 1.804e+03 2.258 0.024159 *
#> LandContour -4.131e+03 2.124e+03 -1.945 0.052082 .
#> Utilities NA NA NA NA
#> LotConfig 6.803e+02 1.240e+03 0.548 0.583536
#> LandSlope 7.990e+03 4.981e+03 1.604 0.109014
#> Neighborhood 4.910e+01 1.677e+02 0.293 0.769746
#> Condition1 -3.255e+03 7.961e+02 -4.089 4.70e-05 ***
#> Condition2 -1.161e+04 3.023e+03 -3.841 0.000131 ***
#> BldgType -9.726e+02 1.906e+03 -0.510 0.610019
#> HouseStyle -7.974e+02 8.378e+02 -0.952 0.341455
#> OverallQual 1.399e+04 1.295e+03 10.808 < 2e-16 ***
#> OverallCond 6.684e+03 1.143e+03 5.846 6.94e-09 ***
#> YearBuilt 3.330e+02 7.492e+01 4.445 9.81e-06 ***
#> YearRemodAdd 2.024e+02 7.231e+01 2.799 0.005236 **
#> RoofStyle -8.664e+02 1.912e+03 -0.453 0.650469
#> RoofMatl 3.548e+03 2.842e+03 1.248 0.212186
#> Exterior1st -2.247e+03 6.128e+02 -3.666 0.000260 ***
#> Exterior2nd 1.268e+03 5.999e+02 2.113 0.034883 *
#> MasVnrType 3.244e+03 1.358e+03 2.389 0.017088 *
#> MasVnrArea 3.127e+01 7.620e+00 4.104 4.41e-05 ***
#> ExterQual 5.809e+03 2.185e+03 2.659 0.007980 **
#> ExterCond -6.560e+02 2.293e+03 -0.286 0.774889
#> Foundation -1.975e+03 1.439e+03 -1.372 0.170314
#> BsmtQual 3.406e+03 1.414e+03 2.410 0.016161 *
#> BsmtCond -4.149e+02 1.410e+03 -0.294 0.768575
#> BsmtExposure 4.595e+03 9.043e+02 5.081 4.52e-07 ***
#> BsmtFinType1 -7.056e+02 7.072e+02 -0.998 0.318674
#> BsmtFinSF1 3.748e+01 5.305e+00 7.065 3.11e-12 ***
#> BsmtFinType2 -7.229e+02 1.091e+03 -0.663 0.507630
#> BsmtFinSF2 2.890e+01 9.445e+00 3.060 0.002274 **
#> BsmtUnfSF 2.033e+01 4.942e+00 4.114 4.23e-05 ***
#> Heating 2.993e+03 3.912e+03 0.765 0.444404
#> HeatingQC -3.033e+03 1.298e+03 -2.337 0.019625 *
#> CentralAir 4.238e+03 5.063e+03 0.837 0.402785
#> Electrical 1.158e+03 1.935e+03 0.599 0.549646
#> `1stFlrSF` 4.770e+01 6.367e+00 7.492 1.55e-13 ***
#> `2ndFlrSF` 4.719e+01 5.478e+00 8.615 < 2e-16 ***
#> LowQualFinSF 3.766e+01 1.922e+01 1.960 0.050276 .
#> BsmtFullBath 3.015e+03 2.726e+03 1.106 0.269013
#> BsmtHalfBath -1.583e+03 4.232e+03 -0.374 0.708412
#> FullBath 3.840e+03 2.932e+03 1.309 0.190706
#> HalfBath 2.834e+03 2.709e+03 1.046 0.295724
#> BedroomAbvGr -6.920e+03 1.788e+03 -3.870 0.000116 ***
#> KitchenAbvGr -2.144e+04 5.518e+03 -3.885 0.000109 ***
#> KitchenQual 8.419e+03 1.674e+03 5.030 5.87e-07 ***
#> TotRmsAbvGrd 3.792e+03 1.286e+03 2.949 0.003270 **
#> Functional -5.979e+03 1.185e+03 -5.046 5.41e-07 ***
#> Fireplaces 4.213e+03 2.422e+03 1.739 0.082312 .
#> FireplaceQu 9.087e+02 1.321e+03 0.688 0.491702
#> GarageType 1.555e+03 1.200e+03 1.296 0.195144
#> GarageYrBlt 3.340e+00 5.133e+00 0.651 0.515338
#> GarageFinish 1.756e+03 1.361e+03 1.290 0.197236
#> GarageCars 5.865e+03 3.066e+03 1.913 0.056066 .
#> GarageArea 2.649e+01 9.994e+00 2.650 0.008181 **
#> GarageQual 4.982e+03 2.843e+03 1.752 0.080033 .
#> GarageCond -3.004e+03 2.347e+03 -1.280 0.200830
#> PavedDrive 4.637e+02 2.976e+03 0.156 0.876210
#> WoodDeckSF 1.627e+01 8.232e+00 1.976 0.048448 *
#> OpenPorchSF 1.075e+00 1.527e+01 0.070 0.943895
#> EnclosedPorch 3.943e+01 1.744e+01 2.261 0.023961 *
#> `3SsnPorch` 6.032e+01 4.144e+01 1.455 0.145874
#> ScreenPorch 3.843e+01 1.766e+01 2.176 0.029770 *
#> PoolArea 1.971e+01 2.987e+01 0.660 0.509643
#> Fence -2.273e+03 1.157e+03 -1.965 0.049758 *
#> MiscVal 1.056e+00 1.650e+00 0.640 0.522259
#> MoSold -2.625e+02 3.501e+02 -0.750 0.453577
#> YrSold 5.353e+02 7.241e+02 0.739 0.459982
#> SaleType 2.102e+03 1.113e+03 1.889 0.059217 .
#> SaleCondition 1.419e+03 1.250e+03 1.135 0.256781
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for gaussian family taken to be 816314422)
#>
#> Null deviance: 6.3448e+12 on 1023 degrees of freedom
#> Residual deviance: 7.7550e+11 on 950 degrees of freedom
#> AIC: 23992
#>
#> Number of Fisher Scoring iterations: 2
predictions <- lf$predict(df = xtest)
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
#> ifelse(type == : prediction from a rank-deficient fit may be misleading
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 43857.05
Lasso Regression
lf <- LMTrainer$new(family = "gaussian", alpha=1, lambda = 1000)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 49210.68
Ridge Regression
lf <- LMTrainer$new(family = "gaussian", alpha=0)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 49740.23
Linear Regression with CV
lf <- LMTrainer$new(family = "gaussian")
lf$cv_model(X = xtrain, y = 'SalePrice', nfolds = 5, parallel = FALSE)
#> Computation done.
predictions <- lf$cv_predict(df = xtest)
coefs <- lf$get_importance()
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 42703.15
Random Forest
rf <- RFTrainer$new(n_estimators = 500, classification = 0)
rf$fit(X = xtrain, y = "SalePrice")
pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> OverallQual 833522821412
#> GarageCars 511723048043
#> 1stFlrSF 487579913708
#> GarageArea 476300688798
#> YearBuilt 334600066921
#> GarageYrBlt 289720646078
#> BsmtQual 243160542356
#> FullBath 235944573775
#> BsmtFinSF1 226109516264
#> LotArea 190775857067
#> TotRmsAbvGrd 181715819545
#> ExterQual 164478784067
#> 2ndFlrSF 158445105373
#> YearRemodAdd 156415339696
#> FireplaceQu 154146120124
#> MasVnrArea 151111410362
#> KitchenQual 145425099126
#> Fireplaces 132307912524
#> Foundation 84719540738
#> LotFrontage 83223657687
#> OpenPorchSF 75956468846
#> BsmtUnfSF 68403366235
#> BsmtFinType1 58318976374
#> WoodDeckSF 52193697598
#> MoSold 50892512614
#> Neighborhood 47969974511
#> GarageType 44714661421
#> BedroomAbvGr 41177535283
#> Exterior2nd 37101388425
#> OverallCond 36263108617
#> MSSubClass 36195306670
#> BsmtExposure 34226511518
#> HeatingQC 32453111883
#> HalfBath 31636584989
#> Exterior1st 30180400616
#> MasVnrType 28031581303
#> RoofStyle 26991326319
#> HouseStyle 25414867454
#> GarageFinish 24699970434
#> RoofMatl 22817636141
#> BsmtFullBath 22341583078
#> LotShape 21448628218
#> YrSold 20978233868
#> MSZoning 19494522596
#> LandContour 18220837783
#> SaleCondition 14891216640
#> EnclosedPorch 13536859215
#> ScreenPorch 13474805603
#> BldgType 13464188196
#> BsmtHalfBath 12944253127
#> GarageQual 12273757801
#> Condition1 11647247111
#> CentralAir 11547411940
#> LandSlope 11500933603
#> SaleType 11394049360
#> GarageCond 10978456317
#> LotConfig 9268886861
#> BsmtFinSF2 9085854718
#> BsmtFinType2 6853792279
#> ExterCond 5947208856
#> Functional 5805977081
#> Fence 5637835972
#> BsmtCond 5527721261
#> KitchenAbvGr 5125635948
#> LowQualFinSF 4137899715
#> PavedDrive 4028037666
#> Heating 2819736225
#> Condition2 2784817722
#> Electrical 2170052792
#> 3SsnPorch 1958900857
#> MiscVal 1816695047
#> Street 489177735
#> PoolArea 194172431
#> Utilities 0
rmse(actual = xtest$SalePrice, predicted = pred)
#> [1] 33924.94
Xgboost
xgb <- XGBTrainer$new(objective = "reg:linear"
, n_estimators = 500
, eval_metric = "rmse"
, maximize = F
, learning_rate = 0.1
,max_depth = 6)
xgb$fit(X = xtrain, y = "SalePrice", valid = xtest)
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:178677.218750 val-rmse:179860.609375
#> [51] train-rmse:8639.050781 val-rmse:35170.535156
#> [101] train-rmse:5012.278320 val-rmse:33791.890625
#> [151] train-rmse:3328.215088 val-rmse:33500.906250
#> [201] train-rmse:2106.097656 val-rmse:33373.820312
#> [251] train-rmse:1484.432861 val-rmse:33339.132812
#> [301] train-rmse:1009.036682 val-rmse:33312.246094
#> [351] train-rmse:680.185913 val-rmse:33291.863281
#> [401] train-rmse:505.055695 val-rmse:33283.257812
#> [451] train-rmse:371.380035 val-rmse:33280.609375
#> [500] train-rmse:282.667938 val-rmse:33279.257812
pred <- xgb$predict(xtest)
rmse(actual = xtest$SalePrice, predicted = pred)
#> [1] 33279.26
Grid Search
Note: accuracy and auc are classification metrics; on this regression task they are not meaningful (hence the zero and NaN values reported below) and are passed here only to demonstrate the API.
xgb <- XGBTrainer$new(objective = "reg:linear")
gst <- GridSearchCV$new(trainer = xgb,
                        parameters = list(n_estimators = c(10, 50),
                                          max_depth = c(5, 2)),
                        n_folds = 3,
                        scoring = c('accuracy', 'auc'))
gst$fit(xtrain, "SalePrice")
#> [1] "entering grid search"
#> [1] "In total, 4 models will be trained"
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:140195.515625
#> [10] train-rmse:15211.144531
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:144265.859375
#> [10] train-rmse:16491.615234
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:141077.531250
#> [10] train-rmse:15997.720703
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:140195.515625
#> [50] train-rmse:3664.224121
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:144265.859375
#> [50] train-rmse:4077.007568
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:141077.531250
#> [50] train-rmse:3878.576660
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:140853.265625
#> [10] train-rmse:29302.730469
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:145067.578125
#> [10] train-rmse:32326.396484
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:141824.375000
#> [10] train-rmse:28798.119141
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:140853.265625
#> [50] train-rmse:16483.222656
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:145067.578125
#> [50] train-rmse:17453.232422
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:141824.375000
#> [50] train-rmse:15867.968750
gst$best_iteration()
#> $n_estimators
#> [1] 10
#>
#> $max_depth
#> [1] 5
#>
#> $accuracy_avg
#> [1] 0
#>
#> $accuracy_sd
#> [1] 0
#>
#> $auc_avg
#> [1] NaN
#>
#> $auc_sd
#> [1] NA
Random Search
The same caveat about accuracy and auc on a regression task applies here.
rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
                          parameters = list(n_estimators = c(10, 50),
                                            max_depth = c(5, 2)),
                          n_folds = 3,
                          scoring = c('accuracy', 'auc'),
                          n_iter = 3)
rst$fit(xtrain, "SalePrice")
#> [1] "In total, 3 models will be trained"
rst$best_iteration()
#> $n_estimators
#> [1] 10
#>
#> $max_depth
#> [1] 5
#>
#> $accuracy_avg
#> [1] 0.009766596
#>
#> $accuracy_sd
#> [1] 0.004482377
#>
#> $auc_avg
#> [1] NaN
#>
#> $auc_sd
#> [1] NA
Here, we will solve a simple binary classification problem: predicting which passengers survived the Titanic shipwreck. The idea is to demonstrate how to use this package to solve classification problems.
Data Preparation
# load the classification data
load('../data/cla_train.rda')
# if the above doesn't work, you can try: load("cla_train.rda")
head(cla_train)
#> PassengerId Survived Pclass
#> 1: 1 0 3
#> 2: 2 1 1
#> 3: 3 1 3
#> 4: 4 1 1
#> 5: 5 0 3
#> 6: 6 0 3
#> Name Sex Age SibSp
#> 1: Braund, Mr. Owen Harris male 22 1
#> 2: Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
#> 3: Heikkinen, Miss. Laina female 26 0
#> 4: Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
#> 5: Allen, Mr. William Henry male 35 0
#> 6: Moran, Mr. James male NA 0
#> Parch Ticket Fare Cabin Embarked
#> 1: 0 A/5 21171 7.2500 S
#> 2: 0 PC 17599 71.2833 C85 C
#> 3: 0 STON/O2. 3101282 7.9250 S
#> 4: 0 113803 53.1000 C123 S
#> 5: 0 373450 8.0500 S
#> 6: 0 330877 8.4583 Q
# split the data
split <- createDataPartition(y = cla_train$Survived, p = 0.7)
xtrain <- cla_train[split$Resample1]
xtest <- cla_train[!split$Resample1]
# encode categorical variables
for (col in c('Embarked', 'Sex', 'Cabin')) {
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[col]], xtest[[col]]))
    xtrain[[col]] <- lbl$transform(xtrain[[col]])
    xtest[[col]] <- lbl$transform(xtest[[col]])
}
#> The data contains blank values. Imputing them with 'NA'
#> (message repeated for each encoded column)
# impute missing values
xtrain[, Age := replace(Age, is.na(Age), median(Age, na.rm = TRUE))]
xtest[, Age := replace(Age, is.na(Age), median(Age, na.rm = TRUE))]
# drop these features
to_drop <- c('PassengerId', 'Ticket', 'Name')
xtrain <- xtrain[, -c(to_drop), with = FALSE]
xtest <- xtest[, -c(to_drop), with = FALSE]
Now our data is ready for model training. Let's do it.
KNN Classification
knn <- KNNTrainer$new(k = 2, prob = TRUE, type = 'class')
knn$fit(train = xtrain, test = xtest, y = 'Survived')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
auc(actual = xtest$Survived, predicted = labels)
#> [1] 0.6637255
Naive Bayes Classification
nb <- NBTrainer$new()
nb$fit(xtrain, 'Survived')
pred <- nb$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.7409982
SVM Classification
# predicts labels
svm <- SVMTrainer$new()
#> [1] "For classification, target variable must be factor type. For regression, target variable must be numeric type."
svm$fit(xtrain, 'Survived')
pred <- svm$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.8447712
Logistic Regression
lf <- LMTrainer$new(family="binomial")
lf$fit(X = xtrain, y = "Survived")
summary(lf$model)
#>
#> Call:
#> stats::glm(formula = f, family = self$family, data = X, weights = self$weights)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.5085 -0.5588 -0.4155 0.6339 2.4082
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 1.471013 0.629914 2.335 0.019530 *
#> Pclass -0.892178 0.184012 -4.848 1.24e-06 ***
#> Sex 2.838180 0.242342 11.711 < 2e-16 ***
#> Age -0.036018 0.009574 -3.762 0.000169 ***
#> SibSp -0.177886 0.129302 -1.376 0.168903
#> Parch -0.424609 0.167448 -2.536 0.011220 *
#> Fare 0.000997 0.002600 0.383 0.701399
#> Cabin 0.014729 0.004721 3.120 0.001808 **
#> Embarked 0.074928 0.140586 0.533 0.594051
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 831.52 on 623 degrees of freedom
#> Residual deviance: 539.22 on 615 degrees of freedom
#> AIC: 557.22
#>
#> Number of Fisher Scoring iterations: 5
predictions <- lf$predict(df = xtest)
auc(actual = xtest$Survived, predicted = predictions)
#> [1] 0.8123292
Lasso Logistic Regression
lf <- LMTrainer$new(family="binomial", alpha=1)
lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
#> Computation done.
pred <- lf$cv_predict(df = xtest)
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.8060903
Ridge Logistic Regression
lf <- LMTrainer$new(family="binomial", alpha=0)
lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
#> Computation done.
pred <- lf$cv_predict(df = xtest)
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.8000297
Random Forest
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 3)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> Sex 75.459878
#> Fare 53.442346
#> Age 44.549006
#> Cabin 29.467851
#> Pclass 23.635666
#> SibSp 10.072035
#> Parch 8.827820
#> Embarked 7.384991
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.8262032
Xgboost
xgb <- XGBTrainer$new(objective = "binary:logistic"
, n_estimators = 500
, eval_metric = "auc"
, maximize = T
, learning_rate = 0.1
,max_depth = 6)
xgb$fit(X = xtrain, y = "Survived", valid = xtest)
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-auc:0.879905 val-auc:0.852169
#> [51] train-auc:0.970513 val-auc:0.847950
#> [101] train-auc:0.984288 val-auc:0.849733
#> [151] train-auc:0.989437 val-auc:0.852941
#> [201] train-auc:0.993094 val-auc:0.855377
#> [251] train-auc:0.995426 val-auc:0.854813
#> [301] train-auc:0.996739 val-auc:0.854159
#> [351] train-auc:0.997488 val-auc:0.855051
#> [401] train-auc:0.997998 val-auc:0.855110
#> [451] train-auc:0.998280 val-auc:0.854813
#> [500] train-auc:0.998486 val-auc:0.855229
pred <- xgb$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.8552288
Grid Search
xgb <- XGBTrainer$new(objective="binary:logistic")
gst <-GridSearchCV$new(trainer = xgb,
parameters = list(n_estimators = c(10,50),
max_depth = c(5,2)),
n_folds = 3,
scoring = c('accuracy','auc'))
gst$fit(xtrain, "Survived")
#> [1] "entering grid search"
#> [1] "In total, 4 models will be trained"
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.144231
#> [10] train-error:0.115385
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.108173
#> [10] train-error:0.084135
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.141827
#> [10] train-error:0.108173
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.144231
#> [50] train-error:0.038462
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.108173
#> [50] train-error:0.045673
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.141827
#> [50] train-error:0.040865
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.213942
#> [10] train-error:0.175481
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.177885
#> [10] train-error:0.134615
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.209135
#> [10] train-error:0.165865
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.213942
#> [50] train-error:0.115385
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.177885
#> [50] train-error:0.100962
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.209135
#> [50] train-error:0.125000
gst$best_iteration()
#> $n_estimators
#> [1] 10
#>
#> $max_depth
#> [1] 5
#>
#> $accuracy_avg
#> [1] 0
#>
#> $accuracy_sd
#> [1] 0
#>
#> $auc_avg
#> [1] 0.8450953
#>
#> $auc_sd
#> [1] 0.05599903
Random Search
rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
                          parameters = list(n_estimators = c(10, 50),
                                            max_depth = c(5, 2)),
                          n_folds = 3,
                          scoring = c('accuracy', 'auc'),
                          n_iter = 3)
rst$fit(xtrain, "Survived")
#> [1] "In total, 3 models will be trained"
rst$best_iteration()
#> $n_estimators
#> [1] 10
#>
#> $max_depth
#> [1] 2
#>
#> $accuracy_avg
#> [1] 0.8060897
#>
#> $accuracy_sd
#> [1] 0.01943006
#>
#> $auc_avg
#> [1] 0.7810661
#>
#> $auc_sd
#> [1] 0.0187002
Let's create a new feature from the target variable using target encoding and test a model. The smoothMean function computes a smoothed per-category mean of the target and returns encodings for both the train and test sets; below we keep the encoded column (the second column of each returned table).
# add target encoding features
xtrain[, feat_01 := smoothMean(train_df = xtrain,
test_df = xtest,
colname = "Embarked",
target = "Survived")$train[[2]]]
xtest[, feat_01 := smoothMean(train_df = xtrain,
test_df = xtest,
colname = "Embarked",
target = "Survived")$test[[2]]]
# train a random forest including the new feature
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 4)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> Sex 76.938777
#> Fare 58.064138
#> Age 48.804344
#> Cabin 30.301633
#> Pclass 24.581820
#> SibSp 9.661118
#> Parch 7.962772
#> Embarked 4.554117
#> feat_01 4.550936
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.8262032