# Homework assignment 6, CROPS545 Statistical Genomics by Ryan Oliveira setwd("~/R/RyanDirectory/Homework6") #Installs necessary packages library(mgcv) source("http://www.bioconductor.org/biocLite.R") biocLite("multtest") install.packages("gplots") install.packages("LDheatmap") install.packages("genetics") install.packages("EMMREML") install.packages("scatterplot3d") #Imports the installed packages library('MASS') # required for ginv library(multtest) library(gplots) library(LDheatmap) library(genetics) library(EMMREML) library(compiler) #this library is already installed in R library("scatterplot3d") # Imports data, source code for exercise source("http://zzlab.net/GAPIT/gapit_functions.txt") source("http://zzlab.net/GAPIT/emma.txt") myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) myG=read.table(file="http://zzlab.net/GAPIT/data/mdp_genotype_test.hmp.txt",head=T) source("http://zzlab.net/StaGen/2017/R/G2P.R") X=myGD[,-1] #--------------Part 1: Simulate 20 QTNs---------------# #Simulates 20 QTNs with normally distributed effects set.seed=78521 #The ZIP code for Brownsville, TX, which is much warmer this time of year mySim = GAPIT.Phenotype.Simulation( GD=myGD, GM=myGM, h2=0.50, NQTN=20, QTNDist="normal" ) #Plots the distribution of QTN effects plot(mySim$QTN.position,mySim$effect, xlab="QTN position", ylab="QTN effect") hist(mySim$effect,xlab="QTN effect",main="QTN effect distribution") #Plots genetic effect (u) against phenotype (Y) and displays correlation ru=cor(mySim$u,mySim$Y$V1) plot(mySim$u,mySim$Y$V1, xlab="Breeding value", ylab="Phenotype") mtext(paste("R square=",ru,sep=""),side=3) #--------Part 2: Marker selection and cross validation--------# #Source the FarmCPU code source("http://www.zzlab.net/FarmCPU/FarmCPU_functions.txt") #Installs necessary packages install.packages("bigmemory") install.packages("biganalytics") #Libraries packages library("bigmemory") library("biganalytics") #To get PC via GAPIT myGAPIT0=GAPIT(GD=myGD,GM=myGM,PCA.total=4) myPC=as.matrix(myGAPIT0$PCA[,-1]) #Makes a Y table to fit FarmCPU myY=data.frame(v1=myGD[,1],v2=mySim$Y$V1) #Runs FarmCPU myGWAS = FarmCPU( Y=myY, GD=myGD, GM=myGM, CV=myPC ) #Choose the top twenty SNPs ntop=20 index=order(myGWAS$GWAS$P.value) top=index[1:ntop] myQTN=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)]) #Loops a cross-validation function thirty times Prediction=replicate(30,{ #Selects 140 random individuals designed by row # to be testing, rest training #One individual is dropped from training to make the numbers equal for correlation later n=nrow(myY) testing=sample(n,round(n/2),replace=F) training.initial=setdiff(seq(1:n),testing) remove.row=sample(length(training.initial),1) training=training.initial[-remove.row] #Runs GAPIT and assesses prediction accuracy myGAPIT <- GAPIT( Y=mySim$Y[training,], GD=myGD, GM=myGM, CV=myQTN, group.from=1, group.to=1, group.by=1, SNP.test=FALSE, memo="GLM+QTN") #Calculates the effect from the PCA + 20 top SNPs effect=myGAPIT$effect.cv #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects effect.matrix=as.matrix(cbind(1,myQTN[,-1])) Pred=effect.matrix%*%effect #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population ry2=cor(Pred[testing],mySim$Y[testing,2])^2 ru2=cor(Pred[testing],mySim$u[testing])^2 return(list(phen = ry2, BV=ru2)) }) #Takes averages phen.mean=mean(as.numeric(Prediction[1,])) phen.sd=sd(as.numeric(Prediction[1,])) BV.mean=mean(as.numeric(Prediction[2,])) BV.sd=sd(as.numeric(Prediction[2,])) #--------Part 3: MAS/CV with random phenotypic shuffling--------# #Source the FarmCPU code source("http://www.zzlab.net/FarmCPU/FarmCPU_functions.txt") #Installs necessary packages install.packages("bigmemory") install.packages("biganalytics") #Libraries packages library("bigmemory") library("biganalytics") #Simulates 20 QTNs with normally distributed effects set.seed=78521 #The ZIP code for Brownsville, TX, which is much warmer this time of year mySim = GAPIT.Phenotype.Simulation( GD=myGD, GM=myGM, h2=0.50, NQTN=20, QTNDist="normal" ) #Makes a Y table to fit FarmCPU myY=data.frame(v1=myGD[,1],v2=mySim$Y$V1) #Randomly re-samples rows from the phenotype set rows=sample(length(myY[,1]),replace=FALSE) #Creates a shuffled phenotype data frame where the phenotypes do not match the QTN effects myShuffle=data.frame(v1=myGD[,1],v2=myY[rows,2]) #To get PC via GAPIT myGAPIT0=GAPIT(GD=myGD,GM=myGM,PCA.total=4) myPC=as.matrix(myGAPIT0$PCA[,-1]) #Runs FarmCPU myGWAS = FarmCPU( Y=myShuffle, GD=myGD, GM=myGM, CV=myPC ) #Choose the top twenty SNPs ntop=20 index=order(myGWAS$GWAS$P.value) top=index[1:ntop] myQTN=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)]) #Loops a cross-validation function thirty times Prediction=replicate(30,{ #Selects 140 random individuals designed by row # to be testing, rest training #One individual is dropped from training to make the numbers equal for correlation later n=nrow(myShuffle) testing=sample(n,round(n/2),replace=F) training.initial=setdiff(seq(1:n),testing) remove.row=sample(length(training.initial),1) training=training.initial[-remove.row] #Runs GAPIT and assesses prediction accuracy myGAPIT <- GAPIT( Y=myShuffle[training,], GD=myGD, GM=myGM, CV=myQTN, group.from=1, group.to=1, group.by=1, SNP.test=FALSE, memo="GLM+QTN") #Calculates the effect from the PCA + 20 top SNPs effect=myGAPIT$effect.cv #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects effect.matrix=as.matrix(cbind(1,myQTN[,-1])) Pred=effect.matrix%*%effect #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population ry2=cor(Pred[testing],myShuffle[testing,2])^2 ru2=cor(Pred[testing],mySim$u[testing])^2 return(list(phen = ry2, BV=ru2)) }) #Takes averages phen.mean=mean(as.numeric(Prediction[1,])) phen.sd=sd(as.numeric(Prediction[1,])) BV.mean=mean(as.numeric(Prediction[2,])) BV.sd=sd(as.numeric(Prediction[2,])) #--------Part 4: gBLUP in training vs. testing population--------# #Simulates 20 QTNs with normally distributed effects set.seed=96701 #The ZIP code for Oahu, Hawaii mySim = GAPIT.Phenotype.Simulation( GD=myGD, GM=myGM, h2=0.50, NQTN=20, QTNDist="normal" ) #Loops a gBLUP function thirty times gBLUP=replicate(30,{ #Selects 20% random individuals designed by row # to be testing, rest training n=nrow(myShuffle) testing=sample(n,round(n/5),replace=F) training=setdiff(seq(1:n),testing) #Runs gBLUP mygBLUP=GAPIT( Y=mySim$Y[training,], GD=myGD, GM=myGM, PCA.total=3, group.from=1000, group.to=1000, group.by=10, SNP.test=FALSE, memo="gBLUP" ) #Performs correlation ry2.blup.train=cor(mygBLUP$Pred[training,5],mySim$Y[training,2])^2 ru2.blup.train=cor(mygBLUP$Pred[training,5],mySim$u[training])^2 ry2.blup.test=cor(mygBLUP$Pred[testing,5],mySim$Y[testing,2])^2 ru2.blup.test=cor(mygBLUP$Pred[testing,5],mySim$u[testing])^2 return(list(phen.train = ry2.blup.train, BV.train = ru2.blup.train, phen.test = ry2.blup.test, BV.test = ru2.blup.test)) }) #Takes averages phen.mean.train=mean(as.numeric(gBLUP[1,])) phen.sd.train=sd(as.numeric(gBLUP[1,])) BV.mean.train=mean(as.numeric(gBLUP[2,])) BV.sd.train=sd(as.numeric(gBLUP[2,])) phen.mean.test=mean(as.numeric(gBLUP[3,])) phen.sd.test=sd(as.numeric(gBLUP[3,])) BV.mean.test=mean(as.numeric(gBLUP[4,])) BV.sd.test=sd(as.numeric(gBLUP[4,])) #--------Part 5: 5-fold CV and rrBLUP--------# #Import rrBLUP install.packages("rrBLUP") library(rrBLUP) #prepare data y = mySim$Y[,2] M = as.matrix(X) #Sets the function to repeat 30 times set.seed(02145) #ZIP code for Somerville, MA nrep=30 rrBLUP.Q5=replicate(nrep,{ #Creates ry.total and ru.total objects for later in the loop ry.total=0 ru.total=0 #Cuts data into 5 groups folds.initial = cut(seq(1,nrow(mySim$Y)),breaks=5,labels=FALSE) #Randomizes the grouping to get a different result each time folds = sample(folds.initial,length(folds.initial),replace=FALSE) for(i in 1:5){ #Selects one group as test group for each of five folds #The test group rotates through each of the five folds in each of the loop's 5 iterations #This is known as "k-fold cross-validation" where k=5 in this example #A running average of phenotype and BV correlation is kept and reported testIndexes=which(folds==i,arr.ind=TRUE) M.test = as.matrix(X[testIndexes,]) M.train = as.matrix(X[-testIndexes,]) y.test = mySim$Y[testIndexes,][,2] y.train = mySim$Y[-testIndexes,][,2] #Ridge Regression blup.train = mixed.solve(y=y.train,Z=M.train) GEBV.train<- M%*%blup.train$u #Calculate correlations ry.iteration=cor(GEBV.train[testIndexes],mySim$Y[testIndexes,2])^2 ru.iteration=cor(GEBV.train[testIndexes],mySim$u[testIndexes])^2 #The "total" value is the current average of each r^2 value #It is updated with the average re-calculated in each iteration #Alternately, values could be added to a vector that is then averaged at the end ry.total=((ry.total*i)+ry.iteration)/i ru.total=((ru.total*i)+ru.iteration)/i } return(list(phen.5=ry.total,BV.5=ru.total)) }) #Takes averages/sd phen.avg.Q5=mean(as.numeric(rrBLUP.Q5[1,])) phen.sd.Q5=sd(as.numeric(rrBLUP.Q5[1,])) BV.avg.Q5=mean(as.numeric(rrBLUP.Q5[2,])) BV.sd.Q5=sd(as.numeric(rrBLUP.Q5[2,]))