Part 3. Selection probabilities.
Schedule:
Part |
Title |
Topics |
III |
Selection probabilties |
An algorithm of selection probabilities |
|
|
- Setting a grid of tuning parameters |
|
|
- Applying the regularization for subsamples |
|
|
Why split data with 0.5 proportion |
|
|
- Advantages of “subagging” |
|
|
Algorithm summary |
|
|
The stability path |
|
|
Threshold to control the false positive |
|
|
Manhattan plot with selection probabilities |
Instructors:
Kipoong Kim
0. Data setup (equivalent to part2)
library(glmnet)
library(qqman)
load("[Data]PNU-winter-workshop-2019.RData")
attach(workshop.data)
1. An algorithm of selection probabilities
|
Algorithm Selection probabilities with elastic-net |
0: |
Let us assume that a genomic data has samples and variables. |
1: |
For all , |
2: |
for k=1 to K do |
3: |
Subsample with size |
4: |
Compute with regularization model |
5: |
end for |
6: |
|
7: |
|
8: |
return SP= |
2. Calculation of selection probabilities
2-1. Setting a grid of tuning parameters
alpha.grid <- seq(0.1, 0.9, 0.1)
if( FALSE ){
lambda.mat <- array( NA, dim = c(length(alpha.grid), 10, 10),
dimnames=list(paste0("alpha=", alpha.grid),
paste0("lambda", 1:10),
paste0("REP=", 1:10)) )
for( REP in 1:10 ){
for( i in 1:length(alpha.grid) ){
set.seed( 1000*REP + i )
sp.subset <- sample( nrow(x), floor(nrow(x)/2) )
lambda.mat[i,,REP] <- glmnet(x=x[sp.subset,], y=y[sp.subset], family="gaussian",
alpha=alpha.grid[i], nlambda=10)$lambda
}
}
}
# save(lambda.mat, file="[object]lambda.mat.RData")
load("[object]lambda.mat.RData")
lambda.grid <- seq( summary(lambda.mat)[2], summary(lambda.mat)[4], length.out = 100 )
range(lambda.grid)
2-2. Applying regularization with the same subsamples for a grid of tuning paramters.
if( FALSE ){
sp.array <- array(0, dim=c(ncol(x), length(alpha.grid), length(lambda.grid)) )
K <- 50 ## recommended to be 100 or more
for( k in 1:K ){
if( k %% 10 == 0 ) cat("A current iteration is ", k, "\n")
set.seed(123*k)
sp.subset <- sample( nrow(x), floor(nrow(x)/2) )
for( i in 1:length(alpha.grid) ){
for( j in 1:length(lambda.grid) ){
sp.fit <- glmnet(x=x[sp.subset,], y=y[sp.subset], family="gaussian",
alpha=alpha.grid[i], lambda=lambda.grid[j])
sp.array[,i,j] <- sp.array[,i,j] + as.numeric(sp.fit$beta!=0)/K
}
}
}
}
# save(sp.array, file="[object]sp.array.RData")
2-3. Retrieving the selection probabilities.
load("[object]sp.array.RData")
sp <- apply( sp.array, 1, max )
head( data.frame(sp = sort( sp, decreasing=TRUE ), variable = order( sp, decreasing=TRUE ) ) )
2-4. Summary of an algorithm of selection probabilities
# library(glmnet)
# alpha.grid <- seq(0.1, 0.9, 0.1)
#
# lambda.mat <- array( NA, dim = c(length(alpha.grid), 10, 10),
# dimnames=list(paste0("alpha=", alpha.grid), paste0("lambda", 1:10), paste0("REP=", 1:10)) )
# for( REP in 1:10 ){
# for( i in 1:length(alpha.grid) ){
# set.seed( 1000*REP + i )
# sp.subset <- sample( nrow(x), floor(nrow(x)/2) )
# lambda.mat[i,,REP] <- glmnet(x=x[sp.subset,], y=y[sp.subset], family="gaussian", alpha=alpha.grid[i], nlambda=10)$lambda
# }
# }
#
# lambda.grid <- seq( summary(lambda.mat)[2], summary(lambda.mat)[4], length.out = 100 )
# lambda.grid
#
# sp.array <- array(0, dim=c(ncol(x), length(alpha.grid), length(lambda.grid)) )
# K <- 50 ## recommended to be 100 or more
# for( k in 1:K ){
# set.seed(123*k)
# sp.subset <- sample( nrow(x), floor(nrow(x)/2) )
#
# for( i in 1:length(alpha.grid) ){
# for( j in 1:length(lambda.grid) ){
# sp.fit <- glmnet(x=x[sp.subset,], y=y[sp.subset], family="gaussian", alpha=alpha.grid[i], lambda=lambda.grid[j])
# sp.array[,i,j] <- sp.array[,i,j] + as.numeric(sp.fit$beta!=0)/K
# }
# }
# }
#
# sp <- apply( sp.array, 1, max )
3. Threshold() to control the false positive
sp.threshold <- function(sp, FP=5){
if(max(sp)>1 | min(sp)<0) stop("'Sel.prob' should be in [0,1].")
if(length( dim(sp) ) < 3) stop("'sp' should be a 3-dimensional array.")
qhat <- sum(sp)/(dim(sp)[2]*dim(sp)[3])
p <- dim(sp)[1]
threshold <- qhat^2/(2*FP*p) + 0.5
if(threshold>1) threshold <- 1
return( threshold )
}
theoretical.threshold5 <- sp.threshold(sp.array, FP=5)
theoretical.threshold10 <- sp.threshold(sp.array, FP=10)
c(theoretical.threshold5, theoretical.threshold10)
4. Manhattan plot with sel.prob
df.sp <- data.frame( ref, p=sp)
colnames(df.sp) <- c("SNP", "CHR", "BP", "P")
var.theoretical.threshold5 <- which( sp > theoretical.threshold5 )
qqman::manhattan(df.sp,
chr="CHR", bp="BP", p="P", snp="SNP", logp=FALSE,
suggestiveline=c(theoretical.threshold5, theoretical.threshold10),
highlight=df.sp[var.theoretical.threshold5, "SNP"],
ylim=c(0,1))
5. Stability selection
5-1. Computing the selection probabilities of each variable for a specific set of
## Stability selection
stable.lambda <- glmnet(x=x, y=y, family="gaussian", alpha=0.1, nlambda=10)$lambda
if( FALSE ){
stable.sp.out <- array(0, dim=c(ncol(x), length(stable.lambda)) )
K=50
for( k in 1:K ){
if(k%%10==0) cat("A current iteration is ", k, "\n")
set.seed(123*k)
sp.subset <- sample( nrow(x), floor(nrow(x)/2) )
for( j in 1:length(stable.lambda) ){
sp.fit <- glmnet(x=x[sp.subset,], y=y[sp.subset], family="gaussian",
alpha=0.1, lambda=stable.lambda[j])
stable.sp.out[,j] <- stable.sp.out[,j] + as.numeric(sp.fit$beta!=0)/K
}
}
}
# save(stable.lambda, file="[object]stable.lambda.RData")
# save(stable.sp.out, file="[object]stable.sp.out.RData")
5-2. The stability path
load("[object]stable.lambda.RData")
load("[object]stable.sp.out.RData")
var.threshold10 <- which( sp > theoretical.threshold10 )
str(var.threshold10)
matplot( log(stable.lambda), t(stable.sp.out), type="l",
lty=ifelse(1:ncol(x) %in% var.threshold10, 1, 2),
col=ifelse(1:ncol(x) %in% var.threshold10, 2, 1),
xlab="log(lambda)", ylab="Selection probabilities" )
# save.image( file="part4.RData" )
6. Example of sp.gwas
6-1. Set-up
# install.packages("devtools")
library(devtools)
install_github("statpng/sp.gwas")
# update.packages()
# remove.package("pkgname"); install.packages("pkgname")
library(sp.gwas)
6-2. Example1
sp.gwas(genotype.path="./genotype1.csv",
phenotype.path="./phenotype1.csv",
save.path="./Test1",
y.col=4:7,
y.id.col=2,
method="lasso",
family="gaussian"
)
6-3. Example2
sp.gwas(genotype.path="./genotype2.csv",
phenotype.path="./phenotype2.csv",
save.path="./Test2",
y.col=8,
y.id.col=1,
method="lasso",
family="gaussian"
)