# CROPS545_Lecture23_CV # Zhiwu Zhang April 3, 2016 # Yuanhong Song modified and commented 4/18/2018 # Set up ---- rm(list=ls()) main="/Users/song/Dropbox/COURSE/CROP_545/2018/labs/lab14" setwd(main) # # Import GAPIT # source("http://www.bioconductor.org/biocLite.R") # biocLite("multtest") # install.packages("gplots") # install.packages("scatterplot3d")#The downloaded link at: http://cran.r-project.org/package=scatterplot3d library('MASS') # required for ginv library(multtest) library(gplots) library(compiler) #required for cmpfun library("scatterplot3d") source("http://www.zzlab.net/GAPIT/emma.txt") source("http://www.zzlab.net/GAPIT/gapit_functions.txt") # Import demo data myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T) # Simultate 10 QTN on the first half chromosomes source("GAPIT.simu2.R") mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,CV=myCV, h2=.5,NQTN=10,effectunit =.95, QTNDist="normal",cveff=c(.51,.51)) source("~/Dropbox/GAPIT/Functions/GAPIT.Main.R") # GWAS 1 ---- # In this section, we perform a basic GWAS using GLM method. # When the result is ready, we examine it, and take out the most important markers dir.create(file.path(main,"GLM")) GLM = paste(main, "GLM", sep="/") setwd(GLM) myGAPIT = GAPIT( Y=mySim$Y, GD=myGD, GM=myGM, PCA.total=3, CV=myCV, #CV.Inheritance=6, group.from=7, group.to=7, group.by=10, QTN.position=mySim$QTN.position, #SNP.test=FALSE, memo="GLM5" ) str(myGAPIT) # correlation between predicted Y and observed Y (simulated) ry2=cor(myGAPIT$Pred$Prediction,mySim$Y$V1)^2 # correlation between predicted Y ru2=cor(myGAPIT$Pred[,8],mySim$u)^2 par(mfrow=c(2,1), mar = c(3,4,1,1)) plot(myGAPIT$Pred[,8],mySim$Y[,2]) mtext(paste("R square=",ry2,sep=""), side = 3) plot(myGAPIT$Pred[,8],mySim$u) mtext(paste("R square=",ru2,sep=""), side = 3) # Adding the top associated SNPs # From the GWAS result above, the most significant QNTs are pulling out # and used as new covariates in next step ntop=20 index=order(myGAPIT$P) top=index[1:ntop] # Put the most significant predictors together # In next GWAS, we will use it as the covariates myQTN=cbind(myGAPIT$PCA[,1:4], myCV[,2:3],myGD[,c(top+1)]) # GWAS 2 with QTN ---- # With GWAS 1, we know what are the potential important markers, and we # saved it as a new dataframe along with the PCAs and CVs, i.e. the most # important predictors. # In this section, we do a GWAS again, and use "the most important predictors" # as the new covariates dir.create(file.path(main,"QTN")) QTN = paste(main, "QTN", sep="/") setwd(QTN) myGAPIT2 <- GAPIT( Y=mySim$Y, GD=myGD, GM=myGM, CV=myQTN, # the most important predictros we prepared group.from=1,group.to=1, group.by=10, QTN.position=mySim$QTN.position, SNP.test=FALSE, memo="GLM+QTN") ry2=cor(myGAPIT2$Pred[,8],mySim$Y[,2])^2 ru2=cor(myGAPIT2$Pred[,8],mySim$u)^2 par(mfrow=c(2,1), mar = c(3,4,1,1)) plot(myGAPIT2$Pred[,8],mySim$Y[,2]) mtext(paste("R square=",ry2,sep=""), side = 3) plot(myGAPIT2$Pred[,8],mySim$u) mtext(paste("R square=",ru2,sep=""), side = 3) # Cross Validation ---- # In this part, we perform similar analysis with the implementation # of cross validation dir.create(file.path(main,"CV")) CV = paste(main, "CV", sep="/") setwd(CV) # Training ---- # We will first split our data into a training set and test set, # run a GWAS with simple GLM (no QTN yet) set.seed(99164) n=nrow(mySim$Y) # randomly sample 140 numbers from 1-281, and we will use it for splitting dataset training=sample(n,round(n/2),replace=F) testing = -training # Now do GWAS: myGAPIT3 <- GAPIT( Y=mySim$Y[training,], # use only half of the data GD=myGD, GM=myGM, CV=myCV, PCA.total=3, group.from=1, group.to=1, group.by=10, QTN.position=mySim$QTN.position, #SNP.test=FALSE, memo="Training" ) # take a look at the result ry2=cor(myGAPIT3$Pred[,8],mySim$Y[,2])^2 ru2=cor(myGAPIT3$Pred[,8],mySim$u)^2 par(mfrow=c(2,1), mar = c(3,4,1,1)) plot(myGAPIT3$Pred[,8],mySim$Y[,2]) mtext(paste("R square=",ry2,sep=""), side = 3) plot(myGAPIT3$Pred[,8],mySim$u) mtext(paste("R square=",ru2,sep=""), side = 3) # From the above, we pulling out the most significant QTNs # and form the most important predictor dataframe: ntop=10 index=order(myGAPIT3$P) top=index[1:ntop] myQTN=cbind(myGAPIT3$PCA[,1:4], myCV[,2:3],myGD[,c(top+1)]) # A GWAS with the QTNs we found in GWAS3, similar to GWAS2, but only on training set myGAPIT4 <- GAPIT( Y=mySim$Y[training,], GD=myGD, GM=myGM, CV=myQTN, group.from=1, group.to=1, group.by=1, QTN.position=mySim$QTN.position, SNP.test=FALSE, memo="GLM+QTN" ) # Now GWAS4 is a model that suppose to explain the relationship between # geno and pheno. We don't know how well it will perform yet. # We will first check the result visually, and then evaluate it on # our testing set. # visually exam the result of GWAS 4 ry2=cor(myGAPIT4$Pred[training,8],mySim$Y[training,2])^2 ru2=cor(myGAPIT4$Pred[training,8],mySim$u[training])^2 par(mfrow=c(2,1), mar = c(3,4,1,1)) plot(myGAPIT4$Pred[training,8],mySim$Y[training,2]) mtext(paste("R square=",ry2,sep=""), side = 3) plot(myGAPIT4$Pred[training,8],mySim$u[training]) mtext(paste("R square=",ru2,sep=""), side = 3) # GWAS 4: kinship matrix quartz() heatmap(cor(myQTN[,-1])) # Testing ---- # We saved some of our data as test set. # We can use the GWAS4 model to predict phenotype of the test set and we can compare the # predicted pheno and the observed pheno (simulated). # put the effects of the most significant predictors together effect=rbind(myGAPIT3$effect.cv, as.matrix(myGAPIT3$effect.snp[top])) # Put the most significatn predictors together X=as.matrix(cbind(1, myCV[,2:3],myGAPIT3$PCA[,-1],myGD[,c(top+1)])) # Predicted phenotype Pred=X%*%effect ry2=cor(Pred[testing],mySim$Y[testing,2])^2 ru2=cor(Pred[testing],mySim$u[testing])^2 par(mfrow=c(2,1), mar = c(3,4,1,1)) plot(Pred[testing],mySim$Y[testing,2]) mtext(paste("R square=",ry2,sep=""), side = 3) plot(Pred[testing],mySim$u[testing]) mtext(paste("R square=",ru2,sep=""), side = 3)