################################Loading data and preparing packages###########################

#getting the genotype file to simulate from
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)

#installing rrBlup package for problem 5
install.packages("rrBLUP")

#loading rrBLUP
library(rrBLUP)

##need this for gapit to work right specifically the van raden function 
library(compiler)

#loading library for heatmap in gapit
library(gplots)

#sourcing emma from zzlab.net
source("http://www.zzlab.net/GAPIT/emma.txt")

#sourcing GAPIT from zzlab.net
source("http://www.zzlab.net/GAPIT/gapit_functions.txt")

#setting things up for phenotype simulation
X=myGD[,-1]
index1to5=myGM[,2]<6
X1to5 = X[,index1to5]
taxa=myGD[,1]
#set.seed(99164)
# Save the markers on chromosomes 1-5 with individual names.
GD.candidate=cbind(taxa,X1to5)

#using gapit to simulate phenotypes
mySim=GAPIT.Phenotype.Simulation(GD=GD.candidate,GM=myGM[index1to5,],h2=.50,NQTN=20,QTNDist="normal")

#plotting distribution of of the QTN effects 
hist(mySim$effect, main = "Histogram of QTN effects")

#plotting effects by QTN
plot(mySim$QTN.position, mySim$effect, ylab = "effect of QTN", xlab = "position", main = "Effect by QTN ppostion")

#ploting correlations 
uYcorr = cor(mySim$u, mySim$Y$V1)
r_sq = uYcorr^2

#plots the two variables against eachother
plot(mySim$u, mySim$Y$V1, ylab = "phenotype", xlab = "genetic effect", main = "PHENOTYPE & BREEDING VALUE CORRELATION")

#paste text onto side 4 for r^2 value
mtext(paste("R^2", r_sq, sep = " "), side = 4)

#########################################Problem 2 #########################################

##sourcing farm_cpu from zzlab.net
source("http://www.zzlab.net/FarmCPU/FarmCPU_functions.txt")

#loading needed libraries
library("bigmemory")
library("biganalytics")

#using gapit to get PCA's to include into model
myGAPIT0=GAPIT(GD=myGD,GM=myGM,PCA.total=4)

#sets PC's as matrix
myPC=as.matrix(myGAPIT0$PCA[,-1])

#Makes a Y table to fit FarmCPU
myY=data.frame(v1=myGD[,1],v2=mySim$Y$V1)

#Runs FarmCPU
myGWAS = FarmCPU(
  Y=myY,
  GD=myGD,
  GM=myGM,
  CV=myPC
)

#finding the top 20 snp
ntop=20
index=order(myGWAS$GWAS$P.value)
top=index[1:ntop]

#makes matrix of QTN for later
myQTN=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)])

#Loops a cross-validation function thirty times
crossVal_2=replicate(30,{
  #Selects 140 random individuals designed by row # to be testing, rest training
  #One individual is dropped from training to make the numbers equal for correlation later
  n=nrow(myY)
  #setting up the sampling
  testing=sample(n,round(n/2),replace=F)
  training.initial=setdiff(seq(1:n),testing)
  
  #randomly removing a row due to unEven num individuals
  remove.row=sample(length(training.initial),1)
  training=training.initial[-remove.row]
  #Runs GAPIT and assesses prediction accuracy
  myGAPIT <- GAPIT(
    Y=mySim$Y[training,],
    GD=myGD,
    GM=myGM,
    CV=myQTN,
    group.from=1,
    group.to=1,
    group.by=1,
    SNP.test=FALSE,
    memo="GLM+QTN")
  #Calculates the effect from the PCA + 20 top SNPs
  effect=myGAPIT$effect.cv
  #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects
  effect.matrix=as.matrix(cbind(1,myQTN[,-1]))
  Pred=effect.matrix%*%effect
  #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population
  ry2=cor(Pred[testing],mySim$Y[testing,2])^2
  ru2=cor(Pred[testing],mySim$u[testing])^2
  return(list(phen = ry2, BV=ru2))
})
crossVal_2

#mean of phenotype correlation
pMeanCor = mean(as.numeric(crossVal_2[1, ]))
pMeanCor

#standard deviation of phenotype correlation
pSdCor = sd(as.numeric(crossVal_2[1, ]))
pSdCor

#mean of breeding value correlation
bMeanCor = mean(as.numeric(crossVal_2[2, ]))
bMeanCor

#standard deviation of breeding value correlation
bSdCor = sd(as.numeric(crossVal_2[2, ]))
bSdCor
#########################Problem 3######################


#reshuffling phenotypes and doing stuff that was not done in problem 2
#Randomly re-samples rows from the phenotype set
rows=sample(length(myY[,1]),replace=FALSE)
#Creates a shuffled phenotype data frame where the phenotypes do not match the QTN effects
myShuffle=data.frame(v1=myGD[,1],v2=myY[rows,2])

cross_val_3=replicate(30,{
  #Selects 140 random individuals designed by row # to be testing, rest training
  #One individual is dropped from training to make the numbers equal for correlation later
  n=nrow(myShuffle)
  testing=sample(n,round(n/2),replace=F)
  training.initial=setdiff(seq(1:n),testing)
  remove.row=sample(length(training.initial),1)
  training=training.initial[-remove.row]
  #Runs GAPIT and assesses prediction accuracy
  myGAPIT <- GAPIT(
    Y=myShuffle[training,],
    GD=myGD,
    GM=myGM,
    CV=myQTN,
    group.from=1,
    group.to=1,
    group.by=1,
    SNP.test=FALSE,
    memo="GLM+QTN")
  #Calculates the effect from the PCA + 20 top SNPs
  effect=myGAPIT$effect.cv
  #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects
  effect.matrix=as.matrix(cbind(1,myQTN[,-1]))
  Pred=effect.matrix%*%effect
  #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population
  ry2=cor(Pred[testing],myShuffle[testing,2])^2
  ru2=cor(Pred[testing],mySim$u[testing])^2
  return(list(phen = ry2, BV=ru2))
})
cross_val_3

#mean value of phenotype cor
p3CorMean = mean(as.numeric(cross_val_3[1,]))
p3CorMean
#sd of phenotypes cor
p3CorSd = sd(as.numeric(cross_val_3[1,]))
p3CorSd
#mean value of breeding values cor
b3CorMean = mean(as.numeric(cross_val_3[2,]))
b3CorMean
#sd of breeding value cor
b3CorSd = sd(as.numeric(cross_val_3[2, ]))
b3CorSd
################################Problem 4 #####################################

#set number for the number of reps
reps = 30

#now perform the replications
Gblup_reps = replicate(reps, {
  
  #sample 20% as testing
  testing = sample(n, round(n/5), replace = FALSE)
  
  #anything indexes not testing will be set to training with setdiff function
  training = setdiff(seq(1:n), testing)
  
  #actually doing Gblup
  mygBLUP=GAPIT(
    Y=mySim$Y[training,],
    GD=myGD,
    GM=myGM,
    PCA.total=3,
    group.from=1000,
    group.to=1000,
    group.by=10,
    SNP.test=FALSE,
    memo="gBLUP"
  )
  
  #getting correlations, Not sure why ryan got R^2 in 2017 hwk but i wont do that
  ry2.blup.train=cor(mygBLUP$Pred[training,5],mySim$Y[training,2])
  ru2.blup.train=cor(mygBLUP$Pred[training,5],mySim$u[training])
  ry2.blup.test=cor(mygBLUP$Pred[testing,5],mySim$Y[testing,2])
  ru2.blup.test=cor(mygBLUP$Pred[testing,5],mySim$u[testing])
  return(list(phen.train = ry2.blup.train, BV.train = ru2.blup.train, phen.test = ry2.blup.test, BV.test = ru2.blup.test))
  
})
Gblup_reps
#mean pheno
meanPhenoTrain = mean(as.numeric(Gblup_reps[1,]))
meanPhenoTrain
#sd pheno
sdPhenoTrain = sd(as.numeric(Gblup_reps[1,]))
sdPhenoTrain
#mean BV
meanBVTrain = mean(as.numeric(Gblup_reps[2, ]))
meanBVTrain
#sd BV
sdBVTrain = sd(as.numeric((Gblup_reps[2, ])))
sdBVTrain

meanPhenoTest = mean(as.numeric(Gblup_reps[3,]))
meanPhenoTest
#sd pheno
sdPhenoTest = sd(as.numeric(Gblup_reps[3,]))
sdPhenoTest
#mean BV
meanBVTest = mean(as.numeric(Gblup_reps[4, ]))
meanBVTest
#sd BV
sdBVTest = sd(as.numeric((Gblup_reps[4, ])))
sdBVTest

hist(as.numeric(Gblup_reps[3, ]))
####################################Problem 5 ###########################

#preparing a dataframe
#remove first column
X = myGD[, -1]

#prepare data
y = mySim$Y[,2]
M = as.matrix(X)

#set reps
reps=30
rrBLUP_reps=replicate(reps,{
  
  #objects to keep adding too inside of loop initialized to zero
  ry_increaser=0
  ru_increaser=0
  
  #break data into 5 groups (its actually the indicies notice the seq)
  slices = cut(seq(1,nrow(mySim$Y)),breaks=5,labels=FALSE)
  
  #pull one of the initial folds (a slice) out using sample function
  folds = sample(slices,length(slices),replace=FALSE)
  
  #one rep for each slice
  for(i in 1:5){
    #selects one test group then test on all 5 folds
    #the rotates through all five as test group
    testIndexes=which(folds==i,arr.ind=TRUE)
    M.test = as.matrix(X[testIndexes,])
    M.train = as.matrix(X[-testIndexes,])
    y.test = mySim$Y[testIndexes,][,2]
    y.train = mySim$Y[-testIndexes,][,2]
    
    #Solve using ridge regression
    blup.train = mixed.solve(y=y.train,Z=M.train)
    
    #use blups to calculate effect
    GEBV.train<- M%*%blup.train$u
    
    #Calculate correlations
    ry.iCor=cor(GEBV.train[testIndexes],mySim$Y[testIndexes,2])
    ru.iCor=cor(GEBV.train[testIndexes],mySim$u[testIndexes])
    #The "total" value is the current average of each r value
    #It is updated with the average re-calculated in each iteration
    #Alternately, values could be added to a vector that is then averaged at the end
    ry_increaser=ry_increaser+ry.iCor
    ru_increaser=ru_increaser+ru.iCor
  }
  #devide the added up correlations by the number of loops
  ry_increaser = ry_increaser / 5
  ru_increaser = ru_increaser / 5
  return(list(pheno_5_cor=ry_increaser,breedVal_5_cor=ru_increaser))
})
rrBLUP_reps

#time to get phenotype cor mean
#set list to nuemeric and take mean and use index to access list
rrblupPhenoMean = mean(as.numeric(rrBLUP_reps[1,]))
rrblupPhenoMean

rrblupPhenoSD = sd(as.numeric(rrBLUP_reps[1,]))
rrblupPhenoSD

rrblupBVMean = mean(as.numeric(rrBLUP_reps[2,]))
rrblupBVMean

rrblupBVSD = sd(as.numeric(rrBLUP_reps[2,]))
rrblupBVSD