Dziś

library(mlbench)

data(BostonHousing)

df <- BostonHousing

set.seed(123)
ind.train <- sample(1:nrow(df), size=400, replace=FALSE)
ind.test  <- setdiff(1:nrow(df), ind.train)
df$medv <- ifelse(df$medv < 25, 0, 1)
df$medv <- factor(df$medv, levels=c("0", "1"))

Różne filtry/wrappery

Przykład: informacja wzajemna, CMIM

y <- df$medv
x_discrete <- df$rad
x_continuous <- infotheo::discretize(df$nox)

infotheo::mutinformation(y, x_discrete)
## [1] 0.09452454
infotheo::mutinformation(y, x_continuous)
## [1] 0.09534981
praznik::CMIM(df[,-14], df$medv, k=10)
## $selection
##      rm     nox   lstat ptratio     age   indus     tax      zn     rad     dis 
##       6       5      13      11       7       3      10       2       9       8 
## 
## $score
##         rm        nox      lstat    ptratio        age      indus        tax 
## 0.26395414 0.09855933 0.09807178 0.06759851 0.04250372 0.04098204 0.04051790 
##         zn        rad        dis 
## 0.03725492 0.03248110 0.02487993

Jakieś filtry

więcej można przeczytać tu: https://mlr.mlr-org.com/articles/tutorial/filter_methods.html

task <- mlr::makeClassifTask(data=df, target="medv")
# potrzebne pakiety: FSelectorRcpp, FSelector
fv <- mlr::generateFilterValuesData(task,
                                    method = c("FSelectorRcpp_information.gain", "FSelector_chi.squared"))
mlr::plotFilterValues(fv, filter="FSelectorRcpp_information.gain")

przykład: Boruta

boruta.res <- Boruta::Boruta(medv~., data = df, maxRuns = 20)
print(boruta.res)
## Boruta performed 19 iterations in 3.938433 secs.
##  12 attributes confirmed important: age, b, crim, dis, indus and 7
## more;
##  No attributes deemed unimportant.
##  1 tentative attributes left: chas;
Boruta::getSelectedAttributes(boruta.res, withTentative = F)
##  [1] "crim"    "zn"      "indus"   "nox"     "rm"      "age"     "dis"    
##  [8] "rad"     "tax"     "ptratio" "b"       "lstat"
plot(boruta.res)

przykład: MCFS

# mcfs.res <- rmcfs::mcfs(medv~., data=df)
# png("fs.png")
# plot(mcfs.res)
# dev.off()
# 
# powiazania <- rmcfs::build.idgraph(mcfs.res)
# png("fs2.png")
# plot(powiazania, label_dist = 1)
# dev.off()

Metody zagnieżdżone i krokowe

przykład: GLM

df$chas <- as.numeric(as.character(df$chas))
glm.lasso <- glmnet::glmnet(as.matrix(df[,-14]), df$medv, family="binomial")
plot(glm.lasso)

glm.lasso.lambda <- glmnet::glmnet(as.matrix(df[,-14]), df$medv, family="binomial", lambda=0.01)
glm.lasso.lambda$beta
## 13 x 1 sparse Matrix of class "dgCMatrix"
##                  s0
## crim     .         
## zn       .         
## indus   -0.02963303
## chas     0.42042521
## nox      .         
## rm       1.86524858
## age      .         
## dis     -0.17010009
## rad      0.01540324
## tax      .         
## ptratio -0.17842433
## b        .         
## lstat   -0.27107394
df.selected <- df[,c(glm.lasso.lambda$beta[,1]!=0, TRUE)]
glm.mod.sel <- glm(medv~., data=df.selected, family=binomial)
summary(glm.mod.sel)
## 
## Call:
## glm(formula = medv ~ ., family = binomial, data = df.selected)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6331  -0.3340  -0.1165   0.0779   3.3407  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.95818    3.55547  -0.551 0.581804    
## indus       -0.13479    0.04903  -2.749 0.005971 ** 
## chas         0.76793    0.70072   1.096 0.273116    
## rm           1.85489    0.41525   4.467 7.93e-06 ***
## dis         -0.37774    0.10337  -3.654 0.000258 ***
## rad          0.11725    0.03766   3.114 0.001848 ** 
## ptratio     -0.29636    0.09849  -3.009 0.002620 ** 
## lstat       -0.40939    0.06497  -6.301 2.95e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 580.85  on 505  degrees of freedom
## Residual deviance: 238.13  on 498  degrees of freedom
## AIC: 254.13
## 
## Number of Fisher Scoring iterations: 7
glm.mod <- glm(medv~., data=df, family=binomial)
glm.mod.AIC <- step(glm.mod, trace = 0)

przykład: LDA

lda.step <- klaR::stepclass(medv~., data=df, method="lda", direction="backward", 
                            criterion="AC", improvement = 0.05, fold=10)
## accuracy: 0.70779;  starting variables (13): crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat 
## accuracy: 0.709;  out: "crim";  variables (12): zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat 
## 
##  hr.elapsed min.elapsed sec.elapsed 
##        0.00        0.00        1.23
plot(lda.step)

sda.mod <- sparseLDA::sda(as.matrix(df[,-14]), df$medv)
sda.mod
## 
## Call:
## sda.default(x = as.matrix(df[, -14]), y = df$medv)
## 
## lambda = 1e-06 
## stop = 13 variables 
## classes = 0, 1 
## 
## Top 5 predictors (out of 13):
##  b, age, tax, crim, zn