Part 2. Regularization for analysis of high-dimensional genomic data
Schedule:
Part | Title | Topics
---|---|---
II | Regularization | Variable selection in high-dimensional data
 | | Regularization procedures
 | | “glmnet”
 | | The Lasso: solution path, cross-validation, variable selection, comparison with univariate analysis, prediction
 | | Elastic-net: cross-validation for two tuning parameters
 | | Covariate-adjusted model
Instructors:
Kipoong Kim
0. Setup
0-1. Installing and calling R packages
install.packages("glmnet")
install.packages("qqman")
library(glmnet)
library(qqman)
0-2. Importing the dataset
load("[Data]PNU-winter-workshop-2019.RData")
ls()
str(workshop.data)
attach(workshop.data)
0-3. Descriptions of data
str(x)
str(y)
str(ref)
0-4. Elements of glmnet
ls("package:glmnet")
0-5. Help pages for more details
help(glmnet)
vignette("glmnet_beta", package="glmnet")
1. The Lasso
1-1. Fitting the lasso model with alpha=1
fit.lasso <- glmnet(x, y, alpha=1, nlambda=10, family="gaussian")
fit.lasso
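- The printed summary shows, for each lambda, the number of nonzero coefficients (Df) and the percentage of deviance explained. The lambda sequence itself can be inspected directly:
fit.lasso$lambda   # the decreasing sequence of 10 penalty values used in the fit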
1-2. Solution path for lasso
plot( fit.lasso, xvar="norm", label=TRUE )
1-3. Getting coefficients for the lasso regression
dim( fit.lasso$beta )
head( fit.lasso$beta )
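- Since fit.lasso$beta is a sparse (p x nlambda) coefficient matrix, the number of selected variables at each lambda can be counted directly:
colSums( as.matrix(fit.lasso$beta) != 0 )   # equals fit.lasso$df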
1-4. Other functions to get coefficients
beta_coef <- coef( fit.lasso, s = fit.lasso$lambda[5] )
beta_pred <- predict( fit.lasso, type="coef", s = fit.lasso$lambda[5] )
1-5. Comparison between three approaches to retrieve coefficients
tmp <- c(33, 54, 192, 350, 361, 435)   # a few SNP indices to compare
data.frame(
  fit.lasso$beta[tmp, 5],   # direct indexing of the coefficient matrix (no intercept row)
  beta_coef[tmp+1, ],       # coef() returns the intercept as row 1, so shift indices by 1
  beta_pred[tmp+1, ]        # same for predict(type="coef")
)
1-6. Coefficients in solution path for the Lasso
plot(fit.lasso, xvar="lambda", label=TRUE)
abline(v=log(116.7), lwd=2, col="red")
2. Cross-validation
2-1. Selecting the tuning parameter using cross-validation
cv.lasso <- cv.glmnet(x, y, alpha=1, nlambda=100, family="gaussian", type.measure = "mse")
str(cv.lasso, max.level = 1)
2-2. CV curve for cv.glmnet
plot(cv.lasso)
2-3. The optimal lambda value
# 1-SE rule: the largest lambda (smallest index, since lambda is decreasing)
# whose CV error lies within one standard error of the minimum CV error
wh.1se <- min( which( cv.lasso$cvm < cv.lasso$cvup[which.min(cv.lasso$cvm)] ))
cv.lasso$lambda[ wh.1se ]
cv.lasso$lambda.1se   # glmnet's built-in value; should agree
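- Similarly, lambda.min can be recovered by hand as the lambda minimizing the mean CV error:
cv.lasso$lambda[ which.min(cv.lasso$cvm) ]
cv.lasso$lambda.min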
2-4. The Lasso models with the optimal lambda values (lambda.min and lambda.1se)
fit.lasso.min <- glmnet(x, y, alpha=1, family="gaussian", lambda=cv.lasso$lambda.min)
fit.lasso.1se <- glmnet(x, y, alpha=1, family="gaussian", lambda=cv.lasso$lambda.1se)
fit.lasso.min
fit.lasso.1se
2-5. Our final Lasso model with lambda.min
lasso.nonzero.min <- which( fit.lasso.min$beta != 0 )
str(lasso.nonzero.min)
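- Assuming the rows of “ref” annotate the corresponding columns of x (as used in section 3-3), the selected variants can be listed:
head( ref[lasso.nonzero.min, ] )   # annotation of the SNPs selected at lambda.min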
3. Visualization in Manhattan plot
3-1. P-values for univariate analysis
summary( lm(y~x[, 1]) )
lm.pvalue <- NULL
for( j in 1:ncol(x) ){
  lm.coef <- summary( lm(y~x[, j]) )$coef
  if( nrow(lm.coef)==2 ){
    lm.pvalue[j] <- lm.coef[2, 4]   # p-value of the slope for SNP j
  } else if( nrow(lm.coef)==1 ){
    lm.pvalue[j] <- NA              # slope not estimable (see 3-2)
  }
}
str( lm.pvalue )
summary( lm.pvalue )
3-2. Treating the missing values
na.lm.pvalue <- which(is.na(lm.pvalue))
table( x[, na.lm.pvalue[1]] )
- NAs come from variables that take only a single value (monomorphic SNPs), for which lm() cannot estimate a slope.
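- As a check, the count of such monomorphic variables should match the number of missing p-values:
sum( apply(x, 2, function(v) length(unique(v)) == 1) )
length( na.lm.pvalue )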
3-3. Formatting input data for manhattan plot
df.lm.pvalue <- data.frame( ref, p=lm.pvalue)
colnames(df.lm.pvalue) <- c("SNP", "CHR", "BP", "P")
3-4. Drawing Manhattan plot with green-colored dots (variables selected by the Lasso)
bonferroni.cutoff <- 0.05/ncol(x) # Bonferroni significant line
qqman::manhattan(df.lm.pvalue,
                 chr="CHR", bp="BP", p="P", snp="SNP",
                 suggestiveline=-log10(bonferroni.cutoff),
                 highlight=df.lm.pvalue[lasso.nonzero.min, "SNP"],
                 ylim=c(0, 10))
4. Prediction
4-1. Split training and test set
set.seed(20190130)
idx.train <- sample(nrow(x), floor(0.9*nrow(x)))   # 90% of samples for training
idx.test <- (1:nrow(x))[-idx.train]
4-2. Training a regularization model whose tuning parameter is selected by cross-validation on the training set
cv.lasso.train <- cv.glmnet(x[idx.train,], y[idx.train], family="gaussian",
                            alpha=1, type.measure="mse")
fit.lasso.train.min <- glmnet(x[idx.train,], y[idx.train], family="gaussian",
                              alpha=1, lambda=cv.lasso.train$lambda.min)
fit.lasso.train.1se <- glmnet(x[idx.train,], y[idx.train], family="gaussian",
                              alpha=1, lambda=cv.lasso.train$lambda.1se)
fit.lasso.train.min
fit.lasso.train.1se
4-3. Predicting the new data
newx <- x[idx.test, ]
newy.fit.min <- predict( fit.lasso.train.min, newx, type="response" )
head(
  data.frame(
    test.y = y[idx.test],
    fit.newy.min = as.numeric(newy.fit.min)
  )
)
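- To quantify accuracy, the test-set mean squared error of both training fits can be computed (a quick sketch using the objects above):
newy.fit.1se <- predict( fit.lasso.train.1se, newx, type="response" )
mean( (y[idx.test] - as.numeric(newy.fit.min))^2 )   # test MSE with lambda.min
mean( (y[idx.test] - as.numeric(newy.fit.1se))^2 )   # test MSE with lambda.1se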
- Note: Regularization procedures are designed primarily for variable selection rather than prediction; when prediction is the main goal, methods such as random forests or boosting are generally recommended.
5. Elastic-net with alpha in [0, 1]
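- In glmnet, the elastic-net penalty is lambda * [ (1-alpha)/2 * ||beta||_2^2 + alpha * ||beta||_1 ]: alpha=1 gives the lasso, alpha=0 gives ridge regression, and intermediate values mix the two penalties.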
5-1. Generating binary response “y2”
y2 <- ifelse( y > median(y), 1, 0 )
table(y2)
5-2. Fitting elastic-net model
fit.enet <- glmnet(x, y2, alpha=0.5, nlambda=10, family="binomial")
fit.enet
6. Selecting the tuning parameters (alpha and lambda)
6-1. Fixing subsamples with “foldid”
set.seed(1234)
foldid <- sample(rep(1:10, length=length(y)))
table(foldid)
6-2. Cross-validation for two tuning parameters with “dev” measure
alpha.grid <- seq(0, 1, 0.1)
cv.enet <- as.list( 1:length(alpha.grid) )
for( h in 1:length(alpha.grid) ){
  cv.enet[[h]] <- cv.glmnet(x, y2, family="binomial", type.measure="dev",
                            alpha=alpha.grid[h], nlambda=15,
                            foldid=foldid)
}
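- A quick way to compare the alpha values is the minimum CV deviance each one achieves:
sapply( cv.enet, function(cv) min(cv$cvm) )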
6-3. Combining the results of cross-validation according to alpha
cv.enet.cvm <- NULL
for( h in 1:length(cv.enet) ){
  cv.enet.cvm <- cbind(cv.enet.cvm, cv.enet[[h]]$cvm)
}
# label rows by lambda index and columns by the actual alpha values
dimnames(cv.enet.cvm) <- list( paste0("lambda.", seq_len(nrow(cv.enet.cvm))),
                               paste0("alpha.", alpha.grid) )
6-4. Finding the optimal tuning parameters with two approaches (lambda.min and lambda.1se)
opt.tune <- which( cv.enet.cvm == min(cv.enet.cvm), arr.ind=TRUE )
opt.tune
opt.alpha <- alpha.grid[ opt.tune[2] ]
opt.lambda.min <- cv.enet[[ opt.tune[2] ]]$lambda.min
opt.lambda.1se <- cv.enet[[ opt.tune[2] ]]$lambda.1se
c(opt.alpha=opt.alpha, opt.lambda.min=opt.lambda.min, opt.lambda.1se=opt.lambda.1se)
6-5. Finer grid search for alpha values below 0.1
alpha.grid2 <- seq(0.01, 0.1, 0.02)
cv.enet2 <- as.list( 1:length(alpha.grid2) )
for( h in 1:length(alpha.grid2) ){
  cv.enet2[[h]] <- cv.glmnet(x, y2, family="binomial", type.measure="dev",
                             alpha=alpha.grid2[h], nlambda=15,
                             foldid=foldid)
}
6-6. Checking whether the finer grid improves the optimum
cv.enet.cvm2 <- NULL
for( h in 1:length(cv.enet2) ){
  cv.enet.cvm2 <- cbind(cv.enet.cvm2, cv.enet2[[h]]$cvm)
}
dimnames(cv.enet.cvm2) <- list( paste0("lambda.", seq_len(nrow(cv.enet.cvm2))),
                                paste0("alpha.", alpha.grid2) )
opt.tune2 <- which( cv.enet.cvm2 == min(cv.enet.cvm2), arr.ind=TRUE )
opt.tune2
opt.alpha2 <- alpha.grid2[ opt.tune2[2] ]
opt.lambda.min2 <- cv.enet2[[ opt.tune2[2] ]]$lambda.min
opt.lambda.1se2 <- cv.enet2[[ opt.tune2[2] ]]$lambda.1se
c(opt.alpha2=opt.alpha2, opt.lambda.min2=opt.lambda.min2, opt.lambda.1se2=opt.lambda.1se2)
- Since opt.alpha2 and opt.lambda.min2 are also small enough to retain many variables, we stop the grid search here.
6-7. The optimal elastic-net models with the selected tuning parameters
fit.enet.opt.min <- glmnet(x, y2, family="binomial", alpha=opt.alpha, lambda=opt.lambda.min)
fit.enet.opt.1se <- glmnet(x, y2, family="binomial", alpha=opt.alpha, lambda=opt.lambda.1se)
fit.enet.opt.min2 <- glmnet(x, y2, family="binomial", alpha=opt.alpha2, lambda=opt.lambda.min2)
fit.enet.opt.1se2 <- glmnet(x, y2, family="binomial", alpha=opt.alpha2, lambda=opt.lambda.1se2)
fit.enet.opt.min
fit.enet.opt.min2
min(cv.enet.cvm)
min(cv.enet.cvm2)
- We choose opt.alpha and opt.lambda.min, the pair attaining the smaller cross-validated deviance, as the optimal tuning parameters.
7. Manhattan plot with the significant variables of the optimal elastic-net model (“fit.enet.opt.min”)
nonzero.enet.opt <- which( fit.enet.opt.min$beta != 0 )
bonferroni.cutoff <- 0.05/ncol(x)
qqman::manhattan(df.lm.pvalue,
                 chr="CHR", bp="BP", p="P", snp="SNP",
                 suggestiveline=-log10(bonferroni.cutoff),
                 highlight=df.lm.pvalue[nonzero.enet.opt, "SNP"],
                 ylim=c(0, 10))
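- The overlap between the lasso selection (continuous y) and the elastic-net selection (binary y2) can be checked directly:
length( intersect(lasso.nonzero.min, nonzero.enet.opt) )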
8. Covariate-adjusted model
- If some variables are already known to be associated with the response, we include them as covariates in the model.
8-1. Generating arbitrary covariates
cov1 <- NULL
cov1[y2==0] <- rbinom(sum(y2==0), 1, prob=0.8)
cov1[y2==1] <- rbinom(sum(y2==1), 1, prob=0.4)
table(y2, cov1)
cov2 <- NULL
cov2[y2==0] <- rpois(sum(y2==0), lambda=40)
cov2[y2==1] <- rpois(sum(y2==1), lambda=60)
boxplot(cov2~y2)
8-2. Excluding covariates from the penalty with “penalty.factor”
x.cov <- cbind(cov1, cov2, x)
pf <- c(0, 0, rep(1, ncol(x)))   # 0 = unpenalized (cov1, cov2), 1 = penalized (SNPs)
table(pf)
pf.fit <- glmnet(x.cov, y2, family="binomial", penalty.factor = pf, nlambda=10 )
pf.fit
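- Because their penalty factors are 0, cov1 and cov2 should never be shrunk to zero; this can be verified across the whole lambda sequence:
pf.fit$beta[1:2, ]   # coefficient rows for cov1 and cov2 in x.cov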
- We can also adjust for eigenvectors or the first n principal components to accommodate population structure.
#save.image(file="part3.RData")