###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###         CROPS 545          ###
  ###         Homework 6         ###
  ###         Evan Craine        ###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###

#prepare the work space
rm(list=ls())

#load the necessary packages

  library('MASS')
  library(multtest)
  library(gplots)
  library(compiler)
  library("scatterplot3d")
  library(bigmemory)
  library(biganalytics)
  library("EMMREML")
  library(rrBLUP)

#load the necessary functions

  source("http://www.zzlab.net/GAPIT/emma.txt")
  source("http://www.zzlab.net/GAPIT/gapit_functions.txt")
  source("GAPIT.simu2.R.txt")

  
#set the working directory, check it
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6")
  getwd()
  dir()
  
#read in the necessary files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)
 
#view each to familiarize with the data table contents and format
  View(myGD) #columns: taxa (281 individs) and 3093 SNPs
  View(myGM) #SNP, Chromosome, Position
  View(myCV) #Taxa, Factor A, Factor B
  
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###         Problem 1          ###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  
  set.seed(99163)

    mySim=GAPIT.Phenotype.Simulation(
    GD=myGD,
    GM=myGM,
    h2=.5,
    NQTN=20,
    #effectunit = .95,
    QTNDist="normal",
    cveff=c(.51,.51),
    CV=myCV
  )

#including cveff drastically changes the scatter plot
  
  effects <- mySim$u #set effect values equal to u from mySim
  myY   <- mySim$Y #set phenotype values equal to Y from mySim 
  r2=round(cor(effects,myY[,2])^2,3) #calcualte r2 and round to third decimal place

#prepare the window area for the plot
  par(mfrow=c(1,1), mar = c(4.5,4.5,1.7,1)) 
  
#compare genetic effects and phenotype by plotting   
  plot(effects,myY[,2], col="royalblue",ylab="Phenotypes",xlab="Genetic Effects",)
  legend("topleft", paste("R square=",r2,sep="")) #include the r squared value on the top left of the grapho
  
#plot the distribution of QTN effects 
  hist(mySim$effect, xlab="QTN effect",main="QTN Effect Distribution",col="royalblue")

  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###         Problem 2          ###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  
#Perform GWAS on the simulated phenotypes with all the individuals by using FarmCPU and selected the top 20 associated SNPs. 
#clear the workspace  
  rm(list=ls())
  
#set the wd
  main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6"
  setwd(main)
  getwd()
  dir()
  
#read in the necessary files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)
  
#load the required functions  
  source("GAPIT.simu2.R.txt")
  source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD
  source("http://www.zzlab.net/GAPIT/emma.txt")
  source("http://www.zzlab.net/GAPIT/gapit_functions.txt")
  source("GAPIT.simu2.R.txt")
  

#perform phenotype simulation   
  set.seed(99163)

  mySim=GAPIT.Phenotype.Simulation(
    GD=myGD,
    GM=myGM,
    h2=0.50,
    NQTN=20,
    QTNDist = "normal")
  
#store the phenotype  
  myY <- mySim$Y #taxa and phenotype value

#perform GWAS using GAPIT to generate PCA
  myGAPIT0 <- GAPIT(
    GD=myGD,
    GM=myGM,
    PCA.total=3,
    file.output = FALSE)

#save the PCA  
  myPC = as.matrix(myGAPIT0$PCA[,-1]) #output is taxa, PC1, PC2, and PC3
  
#performGWAS using FarmCPU
  myFarmCPU = FarmCPU(
    Y=mySim$Y,
    GD=myGD,
    GM=myGM,
    CV=myPC
  )
  

#save the pertinent info from GWAS analysis
  ntop=20
  index=order(myFarmCPU$GWAS$P.value)
  top=index[1:ntop]
  myQTN1=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)]) #has taxa


#perform the replications
  Prediction=replicate(30,{
    #Selects 140 random individuals designed by row # to be testing, rest training
    #One individual is dropped from training to make the numbers equal for correlation later
    n=nrow(myY)
    testing=sample(n,round(n/2),replace=F) #half selected for testing pop
    training.initial=setdiff(seq(1:n),testing) #half selected for training pop
    remove.row=sample(length(training.initial),1)
    training=training.initial[-remove.row] #gets the populations down to 140 individuals each
    #Runs GAPIT and assesses prediction accuracy
    myGAPIT <- GAPIT(
      Y=mySim$Y[training,],
      GD=myGD,
      GM=myGM,
      CV=myQTN1, #info from above
      group.from=1,
      group.to=1,
      group.by=1,
      SNP.test=FALSE,
      memo="GLM+QTN")
    #Calculates the effect from the PCA + 20 top SNPs
    effect=myGAPIT$effect.cv
    #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects
    effect.matrix=as.matrix(cbind(1,myQTN1[,-1]))
    Pred=effect.matrix%*%effect
    #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population
    ry2=cor(Pred[testing],mySim$Y[testing,2])^2
    ru2=cor(Pred[testing],mySim$u[testing])^2
    return(list(phen = ry2, BV=ru2))
  })
  
  pheno.mean=mean(as.numeric(Prediction[1,])) 
  pheno.sd=sd(as.numeric(Prediction[1,]))  
  BV.mean=mean(as.numeric(Prediction[2,])) 
  BV.sd=sd(as.numeric(Prediction[2,])) 
  
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###         Problem 3          ###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###  
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
#Perform GWAS on the simulated phenotypes with all the individuals by using FarmCPU and selected the top 20 associated SNPs. 
#clear the workspace  
  rm(list=ls())

#set the wd
  main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6"
  setwd(main)
  getwd()
  dir()

#read in the necessary files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)

#load the required functions  
  source("GAPIT.simu2.R.txt")
  source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD
  source("http://www.zzlab.net/GAPIT/emma.txt")
  source("http://www.zzlab.net/GAPIT/gapit_functions.txt")
  source("GAPIT.simu2.R.txt")


#perform phenotype simulation   
  set.seed(99163)
  
  mySim=GAPIT.Phenotype.Simulation(
    GD=myGD,
    GM=myGM,
    h2=0.50,
    NQTN=20,
    QTNDist = "normal")

#store the phenotype  
  myY <- mySim$Y #taxa and phenotype value

#shuffle the phenotypes
  rows=sample(length(myY[,1]),replace=FALSE) #randomly choose individuals (as row numbers) from 
  myShuffle=data.frame(v1=myGD[,1],v2=myY[rows,2]) #create a new data frame, taxon doesn't match phenotpye

#perform GWAS using GAPIT to generate PCA
  myGAPIT0 <- GAPIT(
  GD=myGD,
  GM=myGM,
  PCA.total=3,
  file.output = FALSE)

#save the PCA  
  myPC = as.matrix(myGAPIT0$PCA[,-1]) #output is taxa, PC1, PC2, and PC3

#performGWAS using FarmCPU
  myFarmCPU = FarmCPU(
    Y=myShuffle,
    GD=myGD,
    GM=myGM,
    CV=myPC
  )


#save the pertinent info from GWAS analysis
  ntop=20
  index=order(myFarmCPU$GWAS$P.value)
  top=index[1:ntop]
  myQTN1=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)]) #has taxa


#perform the replications
  Prediction=replicate(30,{
    #Selects 140 random individuals designed by row # to be testing, rest training
    #One individual is dropped from training to make the numbers equal for correlation later
    n=nrow(myY)
    testing=sample(n,round(n/2),replace=F) #half selected for testing pop
    training.initial=setdiff(seq(1:n),testing) #half selected for training pop
    remove.row=sample(length(training.initial),1)
    training=training.initial[-remove.row] #gets the populations down to 140 individuals each
    #Runs GAPIT and assesses prediction accuracy
    myGAPIT <- GAPIT(
      Y=myShuffle[training,],
      GD=myGD,
      GM=myGM,
      CV=myQTN1, #info from above
      group.from=1,
      group.to=1,
      group.by=1,
      SNP.test=FALSE,
      memo="GLM+QTN")
    
    effect=myGAPIT$effect.cv #effect calculated from PCA and top 20 QTN
    
    effect.matrix=as.matrix(cbind(1,myQTN1[,-1]))#phenotypes predicted using genotypic data from top 20 QTN
    Pred=effect.matrix%*%effect
    #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population
    ry2=cor(Pred[testing],mySim$Y[testing,2])^2
    ru2=cor(Pred[testing],mySim$u[testing])^2
    return(list(phen = ry2, BV=ru2))
})

#calculate the mean and standard deviation values for each prediction
  pheno.mean=mean(as.numeric(Prediction[1,])) 
  pheno.sd=sd(as.numeric(Prediction[1,]))  
  BV.mean=mean(as.numeric(Prediction[2,])) 
  BV.sd=sd(as.numeric(Prediction[2,])) 

  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###         Problem 3          ###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###  
  
#clear the workspace  
  rm(list=ls())
  
#set the wd
  main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6"
  setwd(main)
  getwd()
  dir()
  
#read in the necessary files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)
  
#load the necessary packages
  library('MASS')
  library(multtest)
  library(gplots)
  library(compiler)
  library("scatterplot3d")
  library(bigmemory)
  library(biganalytics)
  library("EMMREML")
  library(rrBLUP)
  
#load the required functions  
  source("GAPIT.simu2.R.txt")
  source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD
  source("http://www.zzlab.net/GAPIT/emma.txt")
  source("http://www.zzlab.net/GAPIT/gapit_functions.txt")
  source("GAPIT.simu2.R.txt")
  
#simulate phenotypes with h2 = 50%, 20 QTN normally distributed
mySim = GAPIT.Phenotype.Simulation(
  GD=myGD,
  GM=myGM,
  h2=0.50,
  NQTN=20,
  QTNDist = "normal"
)

#store the phenotype  
myY <- mySim$Y #taxa and phenotype value

#shuffle the phenotypes
rows=sample(length(myY[,1]),replace=FALSE) #randomly choose individuals (as row numbers) from 
myShuffle=data.frame(v1=myGD[,1],v2=myY[rows,2]) #create a new data frame, taxon doesn't match phenotpye


#perform GBLUP function thirty times
  gBLUP=replicate(30,{
    #first divide the individuals
    n=nrow(myY)
    testing=sample(n,round(n/5),replace=F) #randomly selecting 20 individuals for the testing population
    training=setdiff(seq(1:n),testing) #set the rest of the individuals as the training population

    #now calculate gBLUP
    mygBLUP = GAPIT(
      y = mySim$Y[training,],
      GD = myGD,
      GM = myGM,
      PCA.total = 3,
      group.from = 1000,
      group.to = 1000,
      group.by = 10,
      SNP.test = FALSE,
      memo = "gBLUP"
    )
    
    #now calculate the correlations in the training and testing pops
    ry2.blup.train = cor(mygBLUP$Pred[training,5],mySim$Y[training,2])^2
    ru2.blup.train = cor(mygBLUP$Pred[training,5],mySim$u[training])^2
    ry2.blup.test = cor(mygBLUP$Pred[testing,5],mySim$Y[testing,2])^2
    ru2.blup.test = cor(mygBLUP$Pred[testing,5],mySim$u[testing])^2
    return(list(phen.train = ry2.blup.train, BV.train = ru2.blup.train, phen.test = ry2.blup.test, BV.test = ru2.blup.test))
  })

#now calculate the averages and the standard deviations
  phen.mean.train=mean(as.numeric(gBLUP[1,]))
  phen.sd.train=sd(as.numeric(gBLUP[1,]))
  BV.mean.train=mean(as.numeric(gBLUP[2,]))
  BV.sd.train=sd(as.numeric(gBLUP[2,]))
  phen.mean.test=mean(as.numeric(gBLUP[3,]))
  phen.sd.test=sd(as.numeric(gBLUP[3,]))
  BV.mean.test=mean(as.numeric(gBLUP[4,]))
  BV.sd.test=sd(as.numeric(gBLUP[4,]))
  
  
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###         Problem 4          ###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###  
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  
  #clear the workspace  
  rm(list=ls())
  
  #set the wd
  main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6"
  setwd(main)
  getwd()
  dir()
  
  #read in the necessary files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)
  
  #load the necessary packages
  library('MASS')
  library(multtest)
  library(gplots)
  library(compiler)
  library("scatterplot3d")
  library(bigmemory)
  library(biganalytics)
  library("EMMREML")
  library(rrBLUP)
  
  #load the required functions  
  source("GAPIT.simu2.R.txt")
  source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD
  source("http://www.zzlab.net/GAPIT/emma.txt")
  source("http://www.zzlab.net/GAPIT/gapit_functions.txt")
  source("GAPIT.simu2.R.txt")
  
  #simulate phenotypes with h2 = 50%, 20 QTN normally distributed
  mySim = GAPIT.Phenotype.Simulation(
    GD=myGD,
    GM=myGM,
    h2=0.50,
    NQTN=20,
    QTNDist = "normal"
  )
  
  #store the phenotype  
  myY <- mySim$Y #taxa and phenotype value
  
  #perform GBLUP function thirty times
  gBLUP=replicate(30,{
    #first divide the individuals
    n=nrow(myY)
    testing=sample(n,round(n/5),replace=F) #randomly selecting 20 individuals for the testing population
    training=setdiff(seq(1:n),testing) #set the rest of the individuals as the training population
    
    #now calculate gBLUP
    mygBLUP = GAPIT(
      y = mySim$Y[training,],
      GD = myGD,
      GM = myGM,
      PCA.total = 3,
      group.from = 1000,
      group.to = 1000,
      group.by = 10,
      SNP.test = FALSE,
      memo = "gBLUP"
    )
    
    #now calculate the correlations in the training and testing pops
    ry2.blup.train = cor(mygBLUP$Pred[training,5],mySim$Y[training,2])^2
    ru2.blup.train = cor(mygBLUP$Pred[training,5],mySim$u[training])^2
    ry2.blup.test = cor(mygBLUP$Pred[testing,5],mySim$Y[testing,2])^2
    ru2.blup.test = cor(mygBLUP$Pred[testing,5],mySim$u[testing])^2
    return(list(phen.train = ry2.blup.train, BV.train = ru2.blup.train, phen.test = ry2.blup.test, BV.test = ru2.blup.test))
  })
  
  #now calculate the averages and the standard deviations
  phen.mean.train=mean(as.numeric(gBLUP[1,]))
  phen.sd.train=sd(as.numeric(gBLUP[1,]))
  BV.mean.train=mean(as.numeric(gBLUP[2,]))
  BV.sd.train=sd(as.numeric(gBLUP[2,]))
  phen.mean.test=mean(as.numeric(gBLUP[3,]))
  phen.sd.test=sd(as.numeric(gBLUP[3,]))
  BV.mean.test=mean(as.numeric(gBLUP[4,]))
  BV.sd.test=sd(as.numeric(gBLUP[4,]))
  
  
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~###
  ###         Problem 5          ###
  ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### 
  #clear the workspace  
  rm(list=ls())
  
  #set the wd
  main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6"
  setwd(main)
  getwd()
  dir()
  
  #read in the necessary files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)
  
  #install and load the rrBLUP package
  install.packages("rrBLUP") 
  library(rrBLUP)
  
  #load the necessary packages
  library('MASS')
  library(multtest)
  library(gplots)
  library(compiler)
  library("scatterplot3d")
  library(bigmemory)
  library(biganalytics)
  library("EMMREML")
  library(rrBLUP)
  
  #load the required functions  
  source("GAPIT.simu2.R.txt")
  source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD
  source("http://www.zzlab.net/GAPIT/emma.txt")
  source("http://www.zzlab.net/GAPIT/gapit_functions.txt")
  source("GAPIT.simu2.R.txt")
  
#simulate phenotypes with h2 = 50%, 20 QTN normally distributed
  set.seed(99164)
  mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.5,NQTN=20, effectunit =.95,QTNDist="normal",CV=myCV,cveff=c(.51,.51))
  myY   <- mySim$Y
  
#prepare for the task
  n=nrow(myY)
  M=as.matrix(myGD[,-1]) #this is teh genotypic data for all individuals at all markers
  
#perform GAPIT to get the PCA
  myGAPIT=GAPIT(GD=myGD,GM=myGM, PCA.total=3,file.output=FALSE)
  pcEnv=cbind(myGAPIT$PCA[,-1],myCV[,-1])
  
  
#prepare for five fold cross validation task
  
  nrep=30 #declare the number of reps
  nfold=5 #declare the number of folds
  
#prepare null vectors to output the results (accuracies) into  
  cor_pheno <- NULL
  cor_gebv  <- NULL
  
  for (rep in 1:nrep){
    c.pheno <- NULL
    c.gebv <- NULL
    #setup fold
    myCut=cut(1:n,nfold,labels=FALSE) #create the folds
    fold=sample(myCut,n) #randomly sample from the folds
    for (f in 1:nfold){   
      print(c(rep,f))
      testing=(fold==f)
      training=!testing
  #use rrBLUP package to perform the genomic selection task
      ans.RR <- mixed.solve(y=myY[training,2],Z=M[training,],X=pcEnv[training,])
      fix_effect <- as.matrix(pcEnv) %*% ans.RR$beta
      ran_effect <- M %*% ans.RR$u
      pred_pheno <- fix_effect + ran_effect
      
      Pred_v_Pheno <- cor(pred_pheno[testing,],mySim$Y[testing,2])
      Pred_v_GEBV <- cor(ran_effect[testing,],mySim$u[testing])
      
      c.pheno <- c(c.pheno,Pred_v_Pheno)
      c.gebv <- c(c.gebv, Pred_v_GEBV)
      
    }
    final_pheno <- sum(c.pheno)/nfold #get out the final phenotypic values
    final_gebv  <- sum(c.gebv)/nfold #get out the final genomic estimated breeding values 
    
    cor_pheno <- c(cor_pheno,final_pheno) #calculate the accuracy between the actual phenotype and the predicted phenotype
    cor_gebv  <- c(cor_gebv, final_gebv) #calculate the accuracy between the actual GEBV and the predicted GEBV
    
  }#rep
  

#calculate the summary statistics
  m1 <- round(mean(cor_pheno),3)
  m1 #0.784
  
  m2 <- round(mean(cor_gebv),3)
  m2 #0.434
  
  s1 <- round(sd(cor_pheno),3)
  s1 #0.005
  
  s2 <- round(sd(cor_gebv),3)
  s2 #0.019