# CROPS545_Lecture23_CV
# Zhiwu Zhang April  3, 2016
# Yuanhong Song modified and commented 4/18/2018

# Set up ----
rm(list=ls())
main="/Users/song/Dropbox/COURSE/CROP_545/2018/labs/lab14"
setwd(main)
# # Import GAPIT
# source("http://www.bioconductor.org/biocLite.R")
# biocLite("multtest")
# install.packages("gplots")
# install.packages("scatterplot3d")#The downloaded link at: http://cran.r-project.org/package=scatterplot3d

library('MASS') # required for ginv
library(multtest)
library(gplots)
library(compiler) #required for cmpfun
library("scatterplot3d")
source("http://www.zzlab.net/GAPIT/emma.txt")
source("http://www.zzlab.net/GAPIT/gapit_functions.txt")

# Import demo data
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)

# Simultate 10 QTN on the first half chromosomes
source("GAPIT.simu2.R")
mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,CV=myCV,
                                 h2=.5,NQTN=10,effectunit =.95,
                                 QTNDist="normal",cveff=c(.51,.51))
source("~/Dropbox/GAPIT/Functions/GAPIT.Main.R")

# GWAS 1 ----
# In this section, we perform a basic GWAS using GLM method.
# When the result is ready, we examine it, and take out the most important markers
dir.create(file.path(main,"GLM"))
GLM = paste(main, "GLM", sep="/")
setwd(GLM)

myGAPIT = GAPIT(
  Y=mySim$Y,
  GD=myGD,
  GM=myGM,
  PCA.total=3,
  CV=myCV,
  #CV.Inheritance=6,
  group.from=7,
  group.to=7,
  group.by=10,
  QTN.position=mySim$QTN.position,
  #SNP.test=FALSE,
  memo="GLM5"
)
str(myGAPIT)
# correlation between predicted Y and observed Y (simulated)
ry2=cor(myGAPIT$Pred$Prediction,mySim$Y$V1)^2 

# correlation between predicted Y
ru2=cor(myGAPIT$Pred[,8],mySim$u)^2

par(mfrow=c(2,1), mar = c(3,4,1,1))
  plot(myGAPIT$Pred[,8],mySim$Y[,2])
  mtext(paste("R square=",ry2,sep=""), side = 3)
  plot(myGAPIT$Pred[,8],mySim$u)
  mtext(paste("R square=",ru2,sep=""), side = 3)

# Adding the top associated SNPs
# From the GWAS result above, the most significant QNTs are pulling out 
# and used as new covariates in next step
ntop=20
index=order(myGAPIT$P)
top=index[1:ntop]
# Put the most significant predictors together
# In next GWAS, we will use it as the covariates
myQTN=cbind(myGAPIT$PCA[,1:4], myCV[,2:3],myGD[,c(top+1)])


# GWAS 2 with QTN ----
# With GWAS 1, we know what are the potential important markers, and we
# saved it as a new dataframe along with the PCAs and CVs, i.e. the most
# important predictors. 
# In this section, we do a GWAS again, and use "the most important predictors"
# as the new covariates
dir.create(file.path(main,"QTN"))
QTN = paste(main, "QTN", sep="/")
setwd(QTN)

myGAPIT2 <- GAPIT(
  Y=mySim$Y, GD=myGD, GM=myGM,
  CV=myQTN, # the most important predictros we prepared 
  group.from=1,group.to=1, group.by=10,
  QTN.position=mySim$QTN.position,
  SNP.test=FALSE,
  memo="GLM+QTN")

ry2=cor(myGAPIT2$Pred[,8],mySim$Y[,2])^2
ru2=cor(myGAPIT2$Pred[,8],mySim$u)^2

par(mfrow=c(2,1), mar = c(3,4,1,1))
  plot(myGAPIT2$Pred[,8],mySim$Y[,2])
  mtext(paste("R square=",ry2,sep=""), side = 3)
  plot(myGAPIT2$Pred[,8],mySim$u)
  mtext(paste("R square=",ru2,sep=""), side = 3)

# Cross Validation ----
# In this part, we perform similar analysis with the implementation
# of cross validation
dir.create(file.path(main,"CV"))
CV = paste(main, "CV", sep="/")
setwd(CV)

# Training ----
# We will first split our data into a training set and test set,
# run a GWAS with simple GLM (no QTN yet)
set.seed(99164)
n=nrow(mySim$Y)
# randomly sample 140 numbers from 1-281, and we will use it for splitting dataset
training=sample(n,round(n/2),replace=F)
testing = -training
# Now do GWAS:
myGAPIT3 <- GAPIT(
  Y=mySim$Y[training,], # use only half of the data
  GD=myGD,
  GM=myGM,
  CV=myCV,
  PCA.total=3,
  group.from=1,
  group.to=1,
  group.by=10,
  QTN.position=mySim$QTN.position,
  #SNP.test=FALSE,
  memo="Training"
)
# take a look at the result
ry2=cor(myGAPIT3$Pred[,8],mySim$Y[,2])^2
ru2=cor(myGAPIT3$Pred[,8],mySim$u)^2
par(mfrow=c(2,1), mar = c(3,4,1,1))
  plot(myGAPIT3$Pred[,8],mySim$Y[,2])
  mtext(paste("R square=",ry2,sep=""), side = 3)
  plot(myGAPIT3$Pred[,8],mySim$u)
  mtext(paste("R square=",ru2,sep=""), side = 3)

# From the above, we pulling out the most significant QTNs
# and form the most important predictor dataframe:
ntop=10
index=order(myGAPIT3$P)
top=index[1:ntop]
myQTN=cbind(myGAPIT3$PCA[,1:4], myCV[,2:3],myGD[,c(top+1)])

# A GWAS with the QTNs we found in GWAS3, similar to GWAS2, but only on training set
myGAPIT4 <- GAPIT(
  Y=mySim$Y[training,],
  GD=myGD,
  GM=myGM,
  CV=myQTN,
  group.from=1,
  group.to=1,
  group.by=1,
  QTN.position=mySim$QTN.position,
  SNP.test=FALSE,
  memo="GLM+QTN"
)

# Now GWAS4 is a model that suppose to explain the relationship between
# geno and pheno. We don't know how well it will perform yet.
# We will first check the result visually, and then evaluate it on 
# our testing set.

# visually exam the result of GWAS 4
ry2=cor(myGAPIT4$Pred[training,8],mySim$Y[training,2])^2
ru2=cor(myGAPIT4$Pred[training,8],mySim$u[training])^2
par(mfrow=c(2,1), mar = c(3,4,1,1))
  plot(myGAPIT4$Pred[training,8],mySim$Y[training,2])
  mtext(paste("R square=",ry2,sep=""), side = 3)
  plot(myGAPIT4$Pred[training,8],mySim$u[training])
  mtext(paste("R square=",ru2,sep=""), side = 3)

# GWAS 4: kinship matrix
quartz()
heatmap(cor(myQTN[,-1]))
  

# Testing ----
# We saved some of our data as test set. 
# We can use the GWAS4 model to predict phenotype of the test set and we can compare the 
# predicted pheno and the observed pheno (simulated).

# put the effects of the most significant predictors together
effect=rbind(myGAPIT3$effect.cv, as.matrix(myGAPIT3$effect.snp[top]))
# Put the most significatn predictors together
X=as.matrix(cbind(1, myCV[,2:3],myGAPIT3$PCA[,-1],myGD[,c(top+1)]))
# Predicted phenotype
Pred=X%*%effect

ry2=cor(Pred[testing],mySim$Y[testing,2])^2
ru2=cor(Pred[testing],mySim$u[testing])^2
par(mfrow=c(2,1), mar = c(3,4,1,1))
  plot(Pred[testing],mySim$Y[testing,2])
  mtext(paste("R square=",ry2,sep=""), side = 3)
  plot(Pred[testing],mySim$u[testing])
  mtext(paste("R square=",ru2,sep=""), side = 3)