# Homework assignment 6, CROPS545 Statistical Genomics by Ryan Oliveira

setwd("~/R/RyanDirectory/Homework6")

#Installs necessary packages
library(mgcv)
source("http://www.bioconductor.org/biocLite.R")
biocLite("multtest")
install.packages("gplots")
install.packages("LDheatmap")
install.packages("genetics")
install.packages("EMMREML")
install.packages("scatterplot3d")

#Imports the installed packages
library('MASS') # required for ginv
library(multtest)
library(gplots)
library(LDheatmap)
library(genetics)
library(EMMREML)
library(compiler) #this library is already installed in R
library("scatterplot3d")

# Imports data, source code for exercise
source("http://zzlab.net/GAPIT/gapit_functions.txt")
source("http://zzlab.net/GAPIT/emma.txt")
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
myG=read.table(file="http://zzlab.net/GAPIT/data/mdp_genotype_test.hmp.txt",head=T)
source("http://zzlab.net/StaGen/2017/R/G2P.R")
X=myGD[,-1]

#--------------Part 1: Simulate 20 QTNs---------------#

#Simulates 20 QTNs with normally distributed effects
set.seed=78521 #The ZIP code for Brownsville, TX, which is much warmer this time of year
mySim = GAPIT.Phenotype.Simulation(
  GD=myGD,
  GM=myGM,
  h2=0.50,
  NQTN=20,
  QTNDist="normal"
  )

#Plots the distribution of QTN effects
plot(mySim$QTN.position,mySim$effect, xlab="QTN position", ylab="QTN effect")
hist(mySim$effect,xlab="QTN effect",main="QTN effect distribution")

#Plots genetic effect (u) against phenotype (Y) and displays correlation
ru=cor(mySim$u,mySim$Y$V1)
plot(mySim$u,mySim$Y$V1, xlab="Breeding value", ylab="Phenotype")
mtext(paste("R square=",ru,sep=""),side=3)

#--------Part 2: Marker selection and cross validation--------#

#Source the FarmCPU code
source("http://www.zzlab.net/FarmCPU/FarmCPU_functions.txt")

#Installs necessary packages
install.packages("bigmemory")
install.packages("biganalytics")

#Libraries packages
library("bigmemory")
library("biganalytics")

#To get PC via GAPIT
myGAPIT0=GAPIT(GD=myGD,GM=myGM,PCA.total=4)
myPC=as.matrix(myGAPIT0$PCA[,-1])

#Makes a Y table to fit FarmCPU
myY=data.frame(v1=myGD[,1],v2=mySim$Y$V1)

#Runs FarmCPU
myGWAS = FarmCPU(
  Y=myY,
  GD=myGD,
  GM=myGM,
  CV=myPC
)

#Choose the top twenty SNPs
ntop=20
index=order(myGWAS$GWAS$P.value)
top=index[1:ntop]
myQTN=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)])

#Loops a cross-validation function thirty times
Prediction=replicate(30,{
#Selects 140 random individuals designed by row # to be testing, rest training
#One individual is dropped from training to make the numbers equal for correlation later
n=nrow(myY)
testing=sample(n,round(n/2),replace=F)
training.initial=setdiff(seq(1:n),testing)
remove.row=sample(length(training.initial),1)
training=training.initial[-remove.row]
#Runs GAPIT and assesses prediction accuracy
myGAPIT <- GAPIT(
  Y=mySim$Y[training,],
  GD=myGD,
  GM=myGM,
  CV=myQTN,
  group.from=1,
  group.to=1,
  group.by=1,
  SNP.test=FALSE,
  memo="GLM+QTN")
#Calculates the effect from the PCA + 20 top SNPs
effect=myGAPIT$effect.cv
#Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects
effect.matrix=as.matrix(cbind(1,myQTN[,-1]))
Pred=effect.matrix%*%effect
#Correlates the predicted phenotypes against breeding value and actual phenotype in testing population
ry2=cor(Pred[testing],mySim$Y[testing,2])^2
ru2=cor(Pred[testing],mySim$u[testing])^2
return(list(phen = ry2, BV=ru2))
})

#Takes averages
phen.mean=mean(as.numeric(Prediction[1,]))
phen.sd=sd(as.numeric(Prediction[1,]))
BV.mean=mean(as.numeric(Prediction[2,]))
BV.sd=sd(as.numeric(Prediction[2,]))

#--------Part 3: MAS/CV with random phenotypic shuffling--------#

#Source the FarmCPU code
source("http://www.zzlab.net/FarmCPU/FarmCPU_functions.txt")

#Installs necessary packages
install.packages("bigmemory")
install.packages("biganalytics")

#Libraries packages
library("bigmemory")
library("biganalytics")

#Simulates 20 QTNs with normally distributed effects
set.seed=78521 #The ZIP code for Brownsville, TX, which is much warmer this time of year
mySim = GAPIT.Phenotype.Simulation(
  GD=myGD,
  GM=myGM,
  h2=0.50,
  NQTN=20,
  QTNDist="normal"
)

#Makes a Y table to fit FarmCPU
myY=data.frame(v1=myGD[,1],v2=mySim$Y$V1)

#Randomly re-samples rows from the phenotype set
rows=sample(length(myY[,1]),replace=FALSE)
#Creates a shuffled phenotype data frame where the phenotypes do not match the QTN effects
myShuffle=data.frame(v1=myGD[,1],v2=myY[rows,2])

#To get PC via GAPIT
myGAPIT0=GAPIT(GD=myGD,GM=myGM,PCA.total=4)
myPC=as.matrix(myGAPIT0$PCA[,-1])
  
#Runs FarmCPU
myGWAS = FarmCPU(
  Y=myShuffle,
  GD=myGD,
  GM=myGM,
  CV=myPC
)

#Choose the top twenty SNPs
ntop=20
index=order(myGWAS$GWAS$P.value)
top=index[1:ntop]
myQTN=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)])

#Loops a cross-validation function thirty times
Prediction=replicate(30,{
  #Selects 140 random individuals designed by row # to be testing, rest training
  #One individual is dropped from training to make the numbers equal for correlation later
  n=nrow(myShuffle)
  testing=sample(n,round(n/2),replace=F)
  training.initial=setdiff(seq(1:n),testing)
  remove.row=sample(length(training.initial),1)
  training=training.initial[-remove.row]
  #Runs GAPIT and assesses prediction accuracy
  myGAPIT <- GAPIT(
    Y=myShuffle[training,],
    GD=myGD,
    GM=myGM,
    CV=myQTN,
    group.from=1,
    group.to=1,
    group.by=1,
    SNP.test=FALSE,
    memo="GLM+QTN")
  #Calculates the effect from the PCA + 20 top SNPs
  effect=myGAPIT$effect.cv
  #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects
  effect.matrix=as.matrix(cbind(1,myQTN[,-1]))
  Pred=effect.matrix%*%effect
  #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population
  ry2=cor(Pred[testing],myShuffle[testing,2])^2
  ru2=cor(Pred[testing],mySim$u[testing])^2
  return(list(phen = ry2, BV=ru2))
})

#Takes averages
phen.mean=mean(as.numeric(Prediction[1,]))
phen.sd=sd(as.numeric(Prediction[1,]))
BV.mean=mean(as.numeric(Prediction[2,]))
BV.sd=sd(as.numeric(Prediction[2,]))

#--------Part 4: gBLUP in training vs. testing population--------#

#Simulates 20 QTNs with normally distributed effects
set.seed=96701 #The ZIP code for Oahu, Hawaii
mySim = GAPIT.Phenotype.Simulation(
  GD=myGD,
  GM=myGM,
  h2=0.50,
  NQTN=20,
  QTNDist="normal"
)

#Loops a gBLUP function thirty times
gBLUP=replicate(30,{
#Selects 20% random individuals designed by row # to be testing, rest training
n=nrow(myShuffle)
testing=sample(n,round(n/5),replace=F)
training=setdiff(seq(1:n),testing)
#Runs gBLUP
mygBLUP=GAPIT(
  Y=mySim$Y[training,],
  GD=myGD,
  GM=myGM,
  PCA.total=3,
  group.from=1000,
  group.to=1000,
  group.by=10,
  SNP.test=FALSE,
  memo="gBLUP"
)
#Performs correlation
ry2.blup.train=cor(mygBLUP$Pred[training,5],mySim$Y[training,2])^2
ru2.blup.train=cor(mygBLUP$Pred[training,5],mySim$u[training])^2
ry2.blup.test=cor(mygBLUP$Pred[testing,5],mySim$Y[testing,2])^2
ru2.blup.test=cor(mygBLUP$Pred[testing,5],mySim$u[testing])^2
return(list(phen.train = ry2.blup.train, BV.train = ru2.blup.train, phen.test = ry2.blup.test, BV.test = ru2.blup.test))
})

#Takes averages
phen.mean.train=mean(as.numeric(gBLUP[1,]))
phen.sd.train=sd(as.numeric(gBLUP[1,]))
BV.mean.train=mean(as.numeric(gBLUP[2,]))
BV.sd.train=sd(as.numeric(gBLUP[2,]))
phen.mean.test=mean(as.numeric(gBLUP[3,]))
phen.sd.test=sd(as.numeric(gBLUP[3,]))
BV.mean.test=mean(as.numeric(gBLUP[4,]))
BV.sd.test=sd(as.numeric(gBLUP[4,]))

#--------Part 5: 5-fold CV and rrBLUP--------#

#Import rrBLUP
install.packages("rrBLUP") 
library(rrBLUP)

#prepare data
y = mySim$Y[,2]
M = as.matrix(X)

#Sets the function to repeat 30 times
set.seed(02145) #ZIP code for Somerville, MA
nrep=30
rrBLUP.Q5=replicate(nrep,{
  
#Creates ry.total and ru.total objects for later in the loop
ry.total=0
ru.total=0

#Cuts data into 5 groups
folds.initial = cut(seq(1,nrow(mySim$Y)),breaks=5,labels=FALSE)
#Randomizes the grouping to get a different result each time
folds = sample(folds.initial,length(folds.initial),replace=FALSE)
for(i in 1:5){
  #Selects one group as test group for each of five folds
  #The test group rotates through each of the five folds in each of the loop's 5 iterations
  #This is known as "k-fold cross-validation" where k=5 in this example
  #A running average of phenotype and BV correlation is kept and reported
  testIndexes=which(folds==i,arr.ind=TRUE)
  M.test = as.matrix(X[testIndexes,])
  M.train = as.matrix(X[-testIndexes,])
  y.test = mySim$Y[testIndexes,][,2]
  y.train = mySim$Y[-testIndexes,][,2]
  #Ridge Regression
  blup.train = mixed.solve(y=y.train,Z=M.train)
  GEBV.train<- M%*%blup.train$u
  #Calculate correlations
  ry.iteration=cor(GEBV.train[testIndexes],mySim$Y[testIndexes,2])^2
  ru.iteration=cor(GEBV.train[testIndexes],mySim$u[testIndexes])^2
  #The "total" value is the current average of each r^2 value
  #It is updated with the average re-calculated in each iteration
  #Alternately, values could be added to a vector that is then averaged at the end
  ry.total=((ry.total*i)+ry.iteration)/i
  ru.total=((ru.total*i)+ru.iteration)/i
}
return(list(phen.5=ry.total,BV.5=ru.total))
})

#Takes averages/sd

phen.avg.Q5=mean(as.numeric(rrBLUP.Q5[1,]))
phen.sd.Q5=sd(as.numeric(rrBLUP.Q5[1,]))
BV.avg.Q5=mean(as.numeric(rrBLUP.Q5[2,]))
BV.sd.Q5=sd(as.numeric(rrBLUP.Q5[2,]))