# Crop 545 - lab 8 # Power and FDR # Y.Song Feb 21, 2018 # 000000000000000000000000000000000000000000000 # # 0. Set things up ---- # # 000000000000000000000000000000000000000000000 rm(list=ls()) # remove any object in the 'Environment' # setwd( ) # Please set working directory as need # The compiler library is required for the cmpfun function # It should be already installed. If not, please run the next line # install.package('compiler') library(compiler) #required for cmpfun # functions will be used for this lab: source("G2P.R") source("GWASbyCor.R") source("http://www.zzlab.net/GAPIT/gapit_functions.txt") # the gapit_functions is a group of functions. # You've been familiar with loading one single function in a file # Notice that this time we sourced the gapit function group, and # you'll see many functions loaded this time. # the example datast myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) # 000000000000000000000000000000000000000000000 # # 1. G2P simulation & GWAS by Corr ---- # # 000000000000000000000000000000000000000000000 # Before we can identify any false positive and perform any power # analysis, we will first simulate a phenometric vector and perform a # GWAS by Correlation analysis, so that we can have something to # analysis. # 1.1 G2P phenotype simulation ------------------------------------ X=myGD[,-1] index1to5=myGM[,2]<6 X1to5 = X[,index1to5] set.seed(99164) mySim=G2P(X= X1to5,h2=.75,alpha=1,NQTN=10,distribution="norm") # notice that we only simulated with QNT's on 1st-5th chromosome # 1.2 GWAS by Corr ------------------------------------ p= GWASbyCor(X=X,y=mySim$y) # ignore the warning round(p[1:5], 4) # ?round() # 000000000000000000000000000000000000000000000 # # 2. True-positive & false-positive ---- # (T-pos) (F-pos) # # 000000000000000000000000000000000000000000000 index=order(p) # rank of each p value in ascending order. Please notice order is # different from sort() # ?order() # Example: lala=c(4,2,6,3,7); order(lala) top5=index[1:5] # location of the five most significant top5 p[top5] # p-value of the 5 most significant QTN's # 2.1 find T-pos and F-pos ------------------------------------ # when simulate the phenotype, we know we only assigned 10 QTN's to # contribute to our phenotype (if you are not clear how this was done, # please review lecture/lab for phenotype simulation). We know where # those QTN are (in mySim$QTN.position) # Thus, if the significant QNT's are the ones we assign, it is true-positive. # Otherwise, the significant ones are false-positive detected=intersect(top5,mySim$QTN.position) # ?intersect() detected # Is detected true-positive or false-positive falsePositive=setdiff(top5, mySim$QTN.position) falsePositive # plot the two situation: color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10) m=nrow(myGM) plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]]) abline(v=mySim$QTN.position, lty = 2, lwd=1, col = "black") abline(v= falsePositive, lty = 2, lwd=2, col = "red") # 2.2 p.null ------------------------------------ # some locations we recieved NA values for p-value. First, we want to know # how many NA p values we got sum(is.na(p)) # recall: TRUE=1 and FALSE=0, and we use sum() to count the number of NA valus # count the number of certain even is a common task in data analysis # there're other ways to do so. # Our null hypothesis - a QTN is not correlated with the phenotype. index.null=!index1to5 & !is.na(p) # index.null = on chromosome 1-6 and the p value is not NA p.null=p[index.null] # get p values for the index.null condition m.null=length(p.null) index.sort=order(p.null) # record the rank of p.null p.null.sort=p.null[index.sort] # note: the above two command is equivelent to # p.null.sort = sort(p.null) head(p.null.sort) # smallest p-values tail(p.null.sort) # lagest p-values seq=seq(1:m.null) table=data.frame(seq, p.null.sort, seq/m.null) # third parameter (seq/m.null) is a ratio, similar to the idea of percentile # equivelent to (current rank)/(total length of array) # put the rank, p.null.sort, and seq.m.null into a dataframe table. head(table,5) tail(table,5) plot(table$seq, table$p.null.sort, xlab = "Rank", ylab = "p-value") # the lower the rank is, the smaller the p-value is # 000000000000000000000000000000000000000000000 # # 3. GAPIT for FDR and Power ---- # # 000000000000000000000000000000000000000000000 # 3.1 QTN Bins ------------------------------------ # First we organize the result above and prepare it in the format for GAPIT function # we will code the location of QTN into bins bigNum=1e9; resolution=1e5 bin=round((myGM$Chromosome*bigNum+myGM$Position)/resolution) # QTN's close to each other will be put in the same bin result=cbind(myGM,t(p),bin) head(result) # notice the second to fourth QNT's are all in bin 10029 # because they are close # Get the bin information for the QTN's we used for phenotype # simulation QTN.bin=result[mySim$QTN.position,] QTN.bin # sort QTN.bin dataset by t(p) value index.qtn.p=order(QTN.bin$`t(p)` ) # notive the 't(p)' colun is quoted in the `` sign (the one on the same key as the ~ sign) QTN.bin[index.qtn.p,] # 3.2 GAPIT FDR TypeI ------------------------------------ # We will use GAPIT.FDR.TypeI function to do the task. myStat=GAPIT.FDR.TypeI( WS=c(1e0,1e3,1e4,1e5), # WS = window size GM=myGM, # dataset seqQTN=mySim$QTN.position, # QTN position GWAS=result # result[SNPm Chromosome, Position, t(p), bin]. See Sec.3 QTN Bins ) str(myStat) myStat$FDR # four columns corresponding to four wondow sizes # ten rows corresponding to ten QTN positions par(mfrow=c(1,2),mar = c(5,4,3,2)) plot(myStat$FDR[,1],myStat$Power,type="b", xlab="FDR", ylab="Power") plot(myStat$TypeI[,1],myStat$Power,type="b", xlab="TypeI error", ylab="Power") # 000000000000000000000000000000000000000000000 # # 4. Repeat GAPIT.FDR.TYPEI 1. ---- # # 000000000000000000000000000000000000000000000 # Now, we will repeat the process 100 times # In each loop, we start from G2P simulation, and ends at calculating the TypeI error nrep=100 set.seed(99164) statRep=replicate(nrep, { mySim=G2P(X=myGD[,-1],h2=.75,alpha=1,NQTN=10,distribution="norm") p=p= GWASbyCor(X=myGD[,-1],y=mySim$y) seqQTN=mySim$QTN.position myGWAS=cbind(myGM,t(p),NA) myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position, GWAS=myGWAS,maxOut=100,MaxBP=1e10) }) # ignore the error str(statRep) # Notice: we asked to save everything in the statRep, thus statRep is a very big list. # But it is still a list, we can use index to get the result we are interested in. power=statRep[[2]]# the second list member power # 4.1 Results as number scores ----------------------------------- s.fdr=seq(3,length(statRep),7) # making a number sequence starting from 3 to the number of the length of statRep, with an # interval of 7. The s.fdr is a index fdr=statRep[s.fdr] # The result is 100 fdr sub-lists. Each sub-list corresponding to one replicate in the # statRep loop (beginning of sec.4) # Observe the structure the Sec.3.2 myStat, make sure you understand how we obtain all fdr fdr.mean=Reduce ("+", fdr) / length(fdr) # Use 'Reduce()' function to perform a element wise summation # ?Reduce # Example: # add <- function(x) Reduce("+", x) # add(list(1, 2, 3)) # similarly, get typeI error rate s.tp1=seq(4,length(statRep),7) tp1=statRep[s.tp1] tp1.mean=Reduce ("+", tp1) / length(tp1) # AUC (area under curve) s.auc.fdr=seq(6,length(statRep),7) auc.fdr=statRep[s.auc.fdr] auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr) # 4.2 Results as plots ------------------------------ # ROC curve (Receiver Operating Characteristic) theColor=rainbow(4) plot(fdr.mean[,1],power , type="b", col=theColor [1],xlim=c(0,1)) for(i in 2:ncol(fdr.mean)){ lines(fdr.mean[,i], power , type="b", col= theColor [i]) } # barplot on AUC barplot(auc.fdr.mean, names.arg=c("1bp", "1K", "10K","100K"), xlab="Resolution", ylab="AUC") # 000000000000000000000000000000000000000000000 # # 5. Repeat with varied Heritability ---- # # 000000000000000000000000000000000000000000000 nrep=100 set.seed(99164) #h2=25% statRep25=replicate(nrep, { mySim=G2P(X=myGD[,-1],h2=.25,alpha=1,NQTN=10,distribution="norm") p=p= GWASbyCor(X=myGD[,-1],y=mySim$y) seqQTN=mySim$QTN.position myGWAS=cbind(myGM,t(p),NA) myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)}) #h2=75% statRep75=replicate(nrep, { mySim=G2P(X=myGD[,-1],h2=.75,alpha=1,NQTN=10,distribution="norm") p=p= GWASbyCor(X=myGD[,-1],y=mySim$y) seqQTN=mySim$QTN.position myGWAS=cbind(myGM,t(p),NA) myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)}) power25=statRep25[[2]] s.t1=seq(4,length(statRep25),7) t1=statRep25[s.t1] t1.mean.25=Reduce ("+", t1) / length(t1) power75=statRep75[[2]] s.t1=seq(4,length(statRep75),7) t1=statRep75[s.t1] t1.mean.75=Reduce ("+", t1) / length(t1) plot(t1.mean.25[,4],power25, type="b", col="blue",xlim=c(0,1)) lines(t1.mean.75[,4], power75, type="b", col= "red")