Part 2. Regularization for analysis of high-dimensional genomic data



Schedule:

Part II: Regularization
  - Variable selection in high-dimensional data
  - Regularization procedures
  - “glmnet”
  - “The Lasso”
      - Solution path
      - Cross-validation
      - Variable selection
      - Comparison with univariate analysis
      - Prediction
  - Elastic-net
      - Cross-validation for two tuning parameters
  - Covariate-adjusted model


Instructors:

Kipoong Kim



0. Setup

0-1. Installing and loading R packages

install.packages("glmnet")
install.packages("qqman")
library(glmnet)
library(qqman)


0-2. Importing the dataset

load("[Data]PNU-winter-workshop-2019.RData")
ls()
str(workshop.data)
attach(workshop.data)


0-3. Descriptions of data

str(x)
str(y)
str(ref)


0-4. Elements of glmnet

ls("package:glmnet")


0-5. Help pages for more details

help(glmnet)
vignette("glmnet_beta", package="glmnet")


1. The Lasso

1-1. Fitting the lasso model with alpha=1

fit.lasso <- glmnet(x, y, alpha=1, nlambda=10, family="gaussian")
fit.lasso
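
For reference, with family=“gaussian” and alpha=1, glmnet fits the lasso by minimizing the penalized least-squares objective described in the glmnet vignette:

\min_{\beta_0,\,\beta}\ \frac{1}{2n}\sum_{i=1}^{n}\bigl(y_i-\beta_0-x_i^{\top}\beta\bigr)^2 + \lambda\lVert\beta\rVert_1

Larger values of lambda shrink more coefficients exactly to zero; nlambda=10 requests a path of 10 lambda values.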


1-2. Solution path for lasso

plot( fit.lasso, xvar="norm", label=TRUE )


1-3. Getting coefficients for the lasso regression

dim( fit.lasso$beta )    # one row per variable, one column per lambda value (sparse matrix)
head( fit.lasso$beta )


1-4. Other functions to get coefficients

beta_coef <- coef( fit.lasso, s = fit.lasso$lambda[5] )
beta_pred <- predict( fit.lasso, type="coef", s = fit.lasso$lambda[5] )


1-5. Comparison between three approaches to retrieving coefficients

tmp <- c(33, 54, 192, 350, 361, 435)   # a few variable indices to compare
data.frame(
  fit.lasso$beta[tmp, 5],   # rows of $beta are the variables (no intercept)
  beta_coef[tmp+1,],        # coef() and predict() return the intercept as row 1,
  beta_pred[tmp+1,]         #   so variable j sits at row j+1
)


1-6. Coefficients in solution path for the Lasso

plot(fit.lasso, xvar="lambda", label=TRUE)
abline(v=log(116.7), cex=2, col="red")


2. Cross-validation

2-1. Selecting the tuning parameter using cross-validation

cv.lasso <- cv.glmnet(x, y, alpha=1, nlambda=100, family="gaussian", type.measure = "mse")
str(cv.lasso, max.level = 1)


2-2. CV curve for cv.glmnet

plot(cv.lasso)


2-3. The optimal lambda value

# one-standard-error rule by hand: the largest lambda whose CV error is
# within one standard error of the minimum CV error
wh.1se <- min( which( cv.lasso$cvm < cv.lasso$cvup[which.min(cv.lasso$cvm)] ))
cv.lasso$lambda[ wh.1se ]
cv.lasso$lambda.1se
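
As a quick sanity check, the index found by hand should reproduce the built-in value:

all.equal( cv.lasso$lambda[wh.1se], cv.lasso$lambda.1se )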


2-4. The Lasso models with the optimal lambda values (lambda.min and lambda.1se)

fit.lasso.min <- glmnet(x, y, alpha=1, family="gaussian", lambda=cv.lasso$lambda.min)
fit.lasso.1se <- glmnet(x, y, alpha=1, family="gaussian", lambda=cv.lasso$lambda.1se)
fit.lasso.min
fit.lasso.1se


2-5. Our final Lasso model with lambda.min

lasso.nonzero.min <- which( fit.lasso.min$beta != 0 )
str(lasso.nonzero.min)
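
To see how many variables the final model keeps, and which ones, a simple check (assuming x carries column names; otherwise the indices themselves identify the variables):

length( lasso.nonzero.min )               # number of selected variables
head( colnames(x)[lasso.nonzero.min] )    # names of the first few, if available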


3. Visualization with a Manhattan plot

3-1. P-values for univariate analysis

summary( lm(y~x[, 1]) )

lm.pvalue <- NULL
for( j in 1:ncol(x) ){
  lm.coef <- summary( lm(y~x[, j]) )$coef
  if( nrow(lm.coef)==2 ){
    lm.pvalue[j] <- lm.coef[2, 4]   # p-value of the slope for variable j
  } else if ( nrow(lm.coef)==1 ){
    lm.pvalue[j] <- NA              # slope not estimable (constant variable)
  }
}

str( lm.pvalue )
summary( lm.pvalue )
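
The loop above is easy to follow but slow when there are many variables; a minimal vectorized sketch with sapply, with the same NA handling (lm.pvalue2 is just a new name introduced here for comparison):

lm.pvalue2 <- sapply( 1:ncol(x), function(j){
  lm.coef <- summary( lm(y ~ x[, j]) )$coef
  if( nrow(lm.coef)==2 ) lm.coef[2, 4] else NA
})
all.equal(lm.pvalue, lm.pvalue2)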


3-2. Handling the missing values

na.lm.pvalue <- which(is.na(lm.pvalue))
table( x[, na.lm.pvalue[1]] )
  • The NAs come from monomorphic variables, i.e., variables that take only a single value.


3-3. Formatting the input data for the Manhattan plot

df.lm.pvalue <- data.frame( ref, p=lm.pvalue)
colnames(df.lm.pvalue) <- c("SNP", "CHR", "BP", "P")


3-4. Drawing a Manhattan plot with green-colored dots (variables selected by the Lasso)

bonferroni.cutoff <- 0.05/ncol(x) # Bonferroni significant line
qqman::manhattan(df.lm.pvalue,
                 chr="CHR", bp="BP", p="P", snp="SNP",
                 suggestiveline=-log10(bonferroni.cutoff),
                 highlight=df.lm.pvalue[lasso.nonzero.min,"SNP"],
                 ylim=c(0,10))


4. Prediction

4-1. Splitting into training and test sets

set.seed(20190130)
idx.train <- sample( nrow(x), floor(0.9*nrow(x)) )   # 90% of samples for training
idx.test <- (1:nrow(x))[-idx.train]
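
A quick sanity check that the split has the expected sizes and no overlap:

c( n.train=length(idx.train), n.test=length(idx.test) )
length( intersect(idx.train, idx.test) )   # should be 0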


4-2. Training a regularized model whose tuning parameter is selected by cross-validation on the training set

cv.lasso.train <- cv.glmnet(x[idx.train,], y[idx.train], family="gaussian",
                            alpha=1, type.measure="mse")

fit.lasso.train.min <- glmnet(x[idx.train,], y[idx.train], family="gaussian",
                              alpha=1, lambda=cv.lasso.train$lambda.min)
fit.lasso.train.1se <- glmnet(x[idx.train,], y[idx.train], family="gaussian",
                              alpha=1, lambda=cv.lasso.train$lambda.1se)
fit.lasso.train.min
fit.lasso.train.1se


4-3. Predicting on new data

newx <- x[idx.test, ]
newy.fit.min <- predict( fit.lasso.train.min, newx, type="response" )

head(
  data.frame(
    test.y=y[idx.test],
    fit.newy.min=as.numeric(newy.fit.min)
  )
)
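
To put a number on predictive accuracy, we can compute the test-set mean squared error for both the lambda.min and lambda.1se models (a simple summary added here; newy.fit.1se is a new name introduced for this comparison):

newy.fit.1se <- predict( fit.lasso.train.1se, newx, type="response" )
c( mse.min = mean( (y[idx.test] - as.numeric(newy.fit.min))^2 ),
   mse.1se = mean( (y[idx.test] - as.numeric(newy.fit.1se))^2 ) )
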
  • Note
    Since regularization procedures are not designed primarily as prediction models, methods such as random forests (“randomForest”) or boosting models are recommended for pure prediction tasks.


5. Elastic-net with alpha in [0, 1]

5-1. Generating binary response “y2”

y2 <- ifelse( y > median(y), 1, 0 )
table(y2)


5-2. Fitting elastic-net model

fit.enet <- glmnet(x, y2, alpha=0.5, nlambda=10, family="binomial")
fit.enet
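
For family=“binomial”, glmnet minimizes the penalized negative log-likelihood with the elastic-net penalty, which interpolates between ridge (alpha=0) and the lasso (alpha=1):

\lambda\Bigl[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2 + \alpha\lVert\beta\rVert_1\Bigr]

With alpha=0.5 both terms get equal weight, so groups of correlated variables tend to enter or leave the model together.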


6. Selecting the tuning parameters (alpha and lambda)

6-1. Fixing the cross-validation folds with “foldid”

set.seed(1234)
foldid <- sample(rep(1:10, length.out=length(y)))   # assign each sample to one of 10 folds
table(foldid)


6-2. Cross-validation for two tuning parameters with “dev” measure

alpha.grid <- seq(0, 1, 0.1)
cv.enet <- as.list( 1:length(alpha.grid) )
for( h in 1:length(alpha.grid) ){
  cv.enet[[h]] <- cv.glmnet(x, y2, family="binomial", type.measure="dev",
                            alpha=alpha.grid[h], nlambda=15,
                            foldid=foldid)
}


6-3. Combining the results of cross-validation according to alpha

cv.enet.cvm <- NULL
for( h in 1:length(cv.enet)){
  cv.enet.cvm <- cbind(cv.enet.cvm, cv.enet[[h]]$cvm)
}
dimnames(cv.enet.cvm) <- list( paste0("lambda.", 1:nrow(cv.enet.cvm)),
                               paste0("alpha.", seq_along(alpha.grid)) )
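
To compare the alpha values directly, we can look at the smallest CV deviance achieved in each column (one column per alpha):

apply( cv.enet.cvm, 2, min )   # best CV deviance for each alpha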


6-4. Finding the optimal tuning parameters with two approaches (lambda.min and lambda.1se)

opt.tune <- which( cv.enet.cvm == min(cv.enet.cvm), arr.ind=TRUE )
opt.tune   # row = lambda index, col = alpha index

opt.alpha <- alpha.grid[ opt.tune[2] ]
opt.lambda.min <- cv.enet[[ opt.tune[2] ]]$lambda.min
opt.lambda.1se <- cv.enet[[ opt.tune[2] ]]$lambda.1se
c(opt.alpha=opt.alpha, opt.lambda.min=opt.lambda.min, opt.lambda.1se=opt.lambda.1se)


6-5. Further grid search for alpha values below 0.1

alpha.grid2 <- seq(0.01, 0.1, 0.02)
cv.enet2 <- as.list( 1:length(alpha.grid2) )
for( h in 1:length(alpha.grid2) ){
  cv.enet2[[h]] <- cv.glmnet(x, y2, family="binomial", type.measure="dev",
                             alpha=alpha.grid2[h], nlambda=15,
                             foldid=foldid)
}


6-6. Deciding whether to refine the search range further

cv.enet.cvm2 <- NULL
for( h in 1:length(cv.enet2)){
  cv.enet.cvm2 <- cbind(cv.enet.cvm2, cv.enet2[[h]]$cvm)
}
dimnames(cv.enet.cvm2) <- list( paste0("lambda.", 1:nrow(cv.enet.cvm2)),
                                paste0("alpha.", seq_along(alpha.grid2)) )

opt.tune2 <- which( cv.enet.cvm2 == min(cv.enet.cvm2), arr.ind=TRUE )
opt.tune2   # row = lambda index, col = alpha index

opt.alpha2 <- alpha.grid2[ opt.tune2[2] ]
opt.lambda.min2 <- cv.enet2[[ opt.tune2[2] ]]$lambda.min
opt.lambda.1se2 <- cv.enet2[[ opt.tune2[2] ]]$lambda.1se
c(opt.alpha2=opt.alpha2, opt.lambda.min2=opt.lambda.min2, opt.lambda.1se2=opt.lambda.1se2)
  • Since opt.alpha2 and opt.lambda.min2 are also small enough that the model retains many variables, we stop the grid search here.


6-7. The optimal elastic-net models with the selected tuning parameters

fit.enet.opt.min <- glmnet(x, y2, family="binomial", alpha=opt.alpha, lambda=opt.lambda.min)
fit.enet.opt.1se <- glmnet(x, y2, family="binomial", alpha=opt.alpha, lambda=opt.lambda.1se)
fit.enet.opt.min2 <- glmnet(x, y2, family="binomial", alpha=opt.alpha2, lambda=opt.lambda.min2)
fit.enet.opt.1se2 <- glmnet(x, y2, family="binomial", alpha=opt.alpha2, lambda=opt.lambda.1se2)

fit.enet.opt.min
fit.enet.opt.min2
min(cv.enet.cvm)    # minimum CV deviance over the first grid
min(cv.enet.cvm2)   # minimum CV deviance over the refined grid
  • Comparing the two minima, we choose opt.alpha and opt.lambda.min from the first grid as the optimal tuning parameters.


7. Manhattan plot with the significant variables of the optimal elastic-net model (“fit.enet.opt.min”)

nonzero.enet.opt <- which( fit.enet.opt.min$beta != 0 )
bonferroni.cutoff <- 0.05/ncol(x)
qqman::manhattan(df.lm.pvalue,
                 chr="CHR", bp="BP", p="P", snp="SNP",
                 suggestiveline=-log10(bonferroni.cutoff),
                 highlight=df.lm.pvalue[nonzero.enet.opt,"SNP"],
                 ylim=c(0,10))


8. Covariate-adjusted model

  • If some variables are already known to be associated with the response, we include them as covariates in the model.


8-1. Generating arbitrary covariates

cov1 <- NULL
cov1[y2==0] <- rbinom(sum(y2==0), 1, prob=0.8)
cov1[y2==1] <- rbinom(sum(y2==1), 1, prob=0.4)
table(y2, cov1)

cov2 <- NULL
cov2[y2==0] <- rpois(sum(y2==0), lambda=40)
cov2[y2==1] <- rpois(sum(y2==1), lambda=60)
boxplot(cov2~y2)


8-2. The “penalty.factor” argument

x.cov <- cbind(cov1, cov2, x)       # covariates first, genomic variables after
pf <- c(0, 0, rep(1,ncol(x)) )      # penalty factor: 0 = unpenalized, 1 = penalized
table(pf)

pf.fit <- glmnet(x.cov, y2, family="binomial", penalty.factor = pf, nlambda=10 )
pf.fit
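
Because cov1 and cov2 have penalty factor 0, they are never shrunk away; a quick check that their coefficients stay in the model along the whole lambda path:

as.matrix( pf.fit$beta )[1:2, ]   # rows "cov1" and "cov2", one column per lambda
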
  • We can also include the leading eigenvectors or the first few principal components as covariates to accommodate population structure, as sketched below.
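
A minimal sketch of that idea, assuming we adjust for the first two principal components of x (prcomp can be slow and memory-hungry on genome-scale matrices, where dedicated tools are often preferred; the names pc, x.pc, pf.pc, and fit.pc are introduced here for illustration):

pc <- prcomp(x)                        # principal components of the variable matrix
x.pc <- cbind(pc$x[, 1:2], x)          # prepend PC1 and PC2 as covariates
pf.pc <- c(0, 0, rep(1, ncol(x)))      # leave the two PCs unpenalized
fit.pc <- glmnet(x.pc, y2, family="binomial", penalty.factor=pf.pc, nlambda=10)
fit.pc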


#save.image(file="part3.RData")