###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ### CROPS 545 ### ### Homework 6 ### ### Evan Craine ### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### #prepare the work space rm(list=ls()) #load the necessary packages library('MASS') library(multtest) library(gplots) library(compiler) library("scatterplot3d") library(bigmemory) library(biganalytics) library("EMMREML") library(rrBLUP) #load the necessary functions source("http://www.zzlab.net/GAPIT/emma.txt") source("http://www.zzlab.net/GAPIT/gapit_functions.txt") source("GAPIT.simu2.R.txt") #set the working directory, check it setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6") getwd() dir() #read in the necessary files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T) #view each to familiarize with the data table contents and format View(myGD) #columns: taxa (281 individs) and 3093 SNPs View(myGM) #SNP, Chromosome, Position View(myCV) #Taxa, Factor A, Factor B ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ### Problem 1 ### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### set.seed(99163) mySim=GAPIT.Phenotype.Simulation( GD=myGD, GM=myGM, h2=.5, NQTN=20, #effectunit = .95, QTNDist="normal", cveff=c(.51,.51), CV=myCV ) #including cveff drastically changes the scatter plot effects <- mySim$u #set effect values equal to u from mySim myY <- mySim$Y #set phenotype values equal to Y from mySim r2=round(cor(effects,myY[,2])^2,3) #calcualte r2 and round to third decimal place #prepare the window area for the plot par(mfrow=c(1,1), mar = c(4.5,4.5,1.7,1)) #compare genetic effects and phenotype by plotting plot(effects,myY[,2], col="royalblue",ylab="Phenotypes",xlab="Genetic Effects",) legend("topleft", paste("R square=",r2,sep="")) #include the r squared value on the top left of the grapho #plot the distribution of QTN effects hist(mySim$effect, xlab="QTN effect",main="QTN Effect Distribution",col="royalblue") ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ### Problem 2 ### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### #Perform GWAS on the simulated phenotypes with all the individuals by using FarmCPU and selected the top 20 associated SNPs. #clear the workspace rm(list=ls()) #set the wd main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6" setwd(main) getwd() dir() #read in the necessary files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T) #load the required functions source("GAPIT.simu2.R.txt") source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD source("http://www.zzlab.net/GAPIT/emma.txt") source("http://www.zzlab.net/GAPIT/gapit_functions.txt") source("GAPIT.simu2.R.txt") #perform phenotype simulation set.seed(99163) mySim=GAPIT.Phenotype.Simulation( GD=myGD, GM=myGM, h2=0.50, NQTN=20, QTNDist = "normal") #store the phenotype myY <- mySim$Y #taxa and phenotype value #perform GWAS using GAPIT to generate PCA myGAPIT0 <- GAPIT( GD=myGD, GM=myGM, PCA.total=3, file.output = FALSE) #save the PCA myPC = as.matrix(myGAPIT0$PCA[,-1]) #output is taxa, PC1, PC2, and PC3 #performGWAS using FarmCPU myFarmCPU = FarmCPU( Y=mySim$Y, GD=myGD, GM=myGM, CV=myPC ) #save the pertinent info from GWAS analysis ntop=20 index=order(myFarmCPU$GWAS$P.value) top=index[1:ntop] myQTN1=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)]) #has taxa #perform the replications Prediction=replicate(30,{ #Selects 140 random individuals designed by row # to be testing, rest training #One individual is dropped from training to make the numbers equal for correlation later n=nrow(myY) testing=sample(n,round(n/2),replace=F) #half selected for testing pop training.initial=setdiff(seq(1:n),testing) #half selected for training pop remove.row=sample(length(training.initial),1) training=training.initial[-remove.row] #gets the populations down to 140 individuals each #Runs GAPIT and assesses prediction accuracy myGAPIT <- GAPIT( Y=mySim$Y[training,], GD=myGD, GM=myGM, CV=myQTN1, #info from above group.from=1, group.to=1, group.by=1, SNP.test=FALSE, memo="GLM+QTN") #Calculates the effect from the PCA + 20 top SNPs effect=myGAPIT$effect.cv #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects effect.matrix=as.matrix(cbind(1,myQTN1[,-1])) Pred=effect.matrix%*%effect #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population ry2=cor(Pred[testing],mySim$Y[testing,2])^2 ru2=cor(Pred[testing],mySim$u[testing])^2 return(list(phen = ry2, BV=ru2)) }) pheno.mean=mean(as.numeric(Prediction[1,])) pheno.sd=sd(as.numeric(Prediction[1,])) BV.mean=mean(as.numeric(Prediction[2,])) BV.sd=sd(as.numeric(Prediction[2,])) ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ### Problem 3 ### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### #Perform GWAS on the simulated phenotypes with all the individuals by using FarmCPU and selected the top 20 associated SNPs. #clear the workspace rm(list=ls()) #set the wd main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6" setwd(main) getwd() dir() #read in the necessary files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T) #load the required functions source("GAPIT.simu2.R.txt") source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD source("http://www.zzlab.net/GAPIT/emma.txt") source("http://www.zzlab.net/GAPIT/gapit_functions.txt") source("GAPIT.simu2.R.txt") #perform phenotype simulation set.seed(99163) mySim=GAPIT.Phenotype.Simulation( GD=myGD, GM=myGM, h2=0.50, NQTN=20, QTNDist = "normal") #store the phenotype myY <- mySim$Y #taxa and phenotype value #shuffle the phenotypes rows=sample(length(myY[,1]),replace=FALSE) #randomly choose individuals (as row numbers) from myShuffle=data.frame(v1=myGD[,1],v2=myY[rows,2]) #create a new data frame, taxon doesn't match phenotpye #perform GWAS using GAPIT to generate PCA myGAPIT0 <- GAPIT( GD=myGD, GM=myGM, PCA.total=3, file.output = FALSE) #save the PCA myPC = as.matrix(myGAPIT0$PCA[,-1]) #output is taxa, PC1, PC2, and PC3 #performGWAS using FarmCPU myFarmCPU = FarmCPU( Y=myShuffle, GD=myGD, GM=myGM, CV=myPC ) #save the pertinent info from GWAS analysis ntop=20 index=order(myFarmCPU$GWAS$P.value) top=index[1:ntop] myQTN1=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)]) #has taxa #perform the replications Prediction=replicate(30,{ #Selects 140 random individuals designed by row # to be testing, rest training #One individual is dropped from training to make the numbers equal for correlation later n=nrow(myY) testing=sample(n,round(n/2),replace=F) #half selected for testing pop training.initial=setdiff(seq(1:n),testing) #half selected for training pop remove.row=sample(length(training.initial),1) training=training.initial[-remove.row] #gets the populations down to 140 individuals each #Runs GAPIT and assesses prediction accuracy myGAPIT <- GAPIT( Y=myShuffle[training,], GD=myGD, GM=myGM, CV=myQTN1, #info from above group.from=1, group.to=1, group.by=1, SNP.test=FALSE, memo="GLM+QTN") effect=myGAPIT$effect.cv #effect calculated from PCA and top 20 QTN effect.matrix=as.matrix(cbind(1,myQTN1[,-1]))#phenotypes predicted using genotypic data from top 20 QTN Pred=effect.matrix%*%effect #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population ry2=cor(Pred[testing],mySim$Y[testing,2])^2 ru2=cor(Pred[testing],mySim$u[testing])^2 return(list(phen = ry2, BV=ru2)) }) #calculate the mean and standard deviation values for each prediction pheno.mean=mean(as.numeric(Prediction[1,])) pheno.sd=sd(as.numeric(Prediction[1,])) BV.mean=mean(as.numeric(Prediction[2,])) BV.sd=sd(as.numeric(Prediction[2,])) ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ### Problem 3 ### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### #clear the workspace rm(list=ls()) #set the wd main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6" setwd(main) getwd() dir() #read in the necessary files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T) #load the necessary packages library('MASS') library(multtest) library(gplots) library(compiler) library("scatterplot3d") library(bigmemory) library(biganalytics) library("EMMREML") library(rrBLUP) #load the required functions source("GAPIT.simu2.R.txt") source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD source("http://www.zzlab.net/GAPIT/emma.txt") source("http://www.zzlab.net/GAPIT/gapit_functions.txt") source("GAPIT.simu2.R.txt") #simulate phenotypes with h2 = 50%, 20 QTN normally distributed mySim = GAPIT.Phenotype.Simulation( GD=myGD, GM=myGM, h2=0.50, NQTN=20, QTNDist = "normal" ) #store the phenotype myY <- mySim$Y #taxa and phenotype value #shuffle the phenotypes rows=sample(length(myY[,1]),replace=FALSE) #randomly choose individuals (as row numbers) from myShuffle=data.frame(v1=myGD[,1],v2=myY[rows,2]) #create a new data frame, taxon doesn't match phenotpye #perform GBLUP function thirty times gBLUP=replicate(30,{ #first divide the individuals n=nrow(myY) testing=sample(n,round(n/5),replace=F) #randomly selecting 20 individuals for the testing population training=setdiff(seq(1:n),testing) #set the rest of the individuals as the training population #now calculate gBLUP mygBLUP = GAPIT( y = mySim$Y[training,], GD = myGD, GM = myGM, PCA.total = 3, group.from = 1000, group.to = 1000, group.by = 10, SNP.test = FALSE, memo = "gBLUP" ) #now calculate the correlations in the training and testing pops ry2.blup.train = cor(mygBLUP$Pred[training,5],mySim$Y[training,2])^2 ru2.blup.train = cor(mygBLUP$Pred[training,5],mySim$u[training])^2 ry2.blup.test = cor(mygBLUP$Pred[testing,5],mySim$Y[testing,2])^2 ru2.blup.test = cor(mygBLUP$Pred[testing,5],mySim$u[testing])^2 return(list(phen.train = ry2.blup.train, BV.train = ru2.blup.train, phen.test = ry2.blup.test, BV.test = ru2.blup.test)) }) #now calculate the averages and the standard deviations phen.mean.train=mean(as.numeric(gBLUP[1,])) phen.sd.train=sd(as.numeric(gBLUP[1,])) BV.mean.train=mean(as.numeric(gBLUP[2,])) BV.sd.train=sd(as.numeric(gBLUP[2,])) phen.mean.test=mean(as.numeric(gBLUP[3,])) phen.sd.test=sd(as.numeric(gBLUP[3,])) BV.mean.test=mean(as.numeric(gBLUP[4,])) BV.sd.test=sd(as.numeric(gBLUP[4,])) ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ### Problem 4 ### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### #clear the workspace rm(list=ls()) #set the wd main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6" setwd(main) getwd() dir() #read in the necessary files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T) #load the necessary packages library('MASS') library(multtest) library(gplots) library(compiler) library("scatterplot3d") library(bigmemory) library(biganalytics) library("EMMREML") library(rrBLUP) #load the required functions source("GAPIT.simu2.R.txt") source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD source("http://www.zzlab.net/GAPIT/emma.txt") source("http://www.zzlab.net/GAPIT/gapit_functions.txt") source("GAPIT.simu2.R.txt") #simulate phenotypes with h2 = 50%, 20 QTN normally distributed mySim = GAPIT.Phenotype.Simulation( GD=myGD, GM=myGM, h2=0.50, NQTN=20, QTNDist = "normal" ) #store the phenotype myY <- mySim$Y #taxa and phenotype value #perform GBLUP function thirty times gBLUP=replicate(30,{ #first divide the individuals n=nrow(myY) testing=sample(n,round(n/5),replace=F) #randomly selecting 20 individuals for the testing population training=setdiff(seq(1:n),testing) #set the rest of the individuals as the training population #now calculate gBLUP mygBLUP = GAPIT( y = mySim$Y[training,], GD = myGD, GM = myGM, PCA.total = 3, group.from = 1000, group.to = 1000, group.by = 10, SNP.test = FALSE, memo = "gBLUP" ) #now calculate the correlations in the training and testing pops ry2.blup.train = cor(mygBLUP$Pred[training,5],mySim$Y[training,2])^2 ru2.blup.train = cor(mygBLUP$Pred[training,5],mySim$u[training])^2 ry2.blup.test = cor(mygBLUP$Pred[testing,5],mySim$Y[testing,2])^2 ru2.blup.test = cor(mygBLUP$Pred[testing,5],mySim$u[testing])^2 return(list(phen.train = ry2.blup.train, BV.train = ru2.blup.train, phen.test = ry2.blup.test, BV.test = ru2.blup.test)) }) #now calculate the averages and the standard deviations phen.mean.train=mean(as.numeric(gBLUP[1,])) phen.sd.train=sd(as.numeric(gBLUP[1,])) BV.mean.train=mean(as.numeric(gBLUP[2,])) BV.sd.train=sd(as.numeric(gBLUP[2,])) phen.mean.test=mean(as.numeric(gBLUP[3,])) phen.sd.test=sd(as.numeric(gBLUP[3,])) BV.mean.test=mean(as.numeric(gBLUP[4,])) BV.sd.test=sd(as.numeric(gBLUP[4,])) ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### ### Problem 5 ### ###~~~~~~~~~~~~~~~~~~~~~~~~~~~~### #clear the workspace rm(list=ls()) #set the wd main="~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW6" setwd(main) getwd() dir() #read in the necessary files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T) #install and load the rrBLUP package install.packages("rrBLUP") library(rrBLUP) #load the necessary packages library('MASS') library(multtest) library(gplots) library(compiler) library("scatterplot3d") library(bigmemory) library(biganalytics) library("EMMREML") library(rrBLUP) #load the required functions source("GAPIT.simu2.R.txt") source("FarmCPU_functions.txt") #save as .txt from ZZlab if you don't already have in WD source("http://www.zzlab.net/GAPIT/emma.txt") source("http://www.zzlab.net/GAPIT/gapit_functions.txt") source("GAPIT.simu2.R.txt") #simulate phenotypes with h2 = 50%, 20 QTN normally distributed set.seed(99164) mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.5,NQTN=20, effectunit =.95,QTNDist="normal",CV=myCV,cveff=c(.51,.51)) myY <- mySim$Y #prepare for the task n=nrow(myY) M=as.matrix(myGD[,-1]) #this is teh genotypic data for all individuals at all markers #perform GAPIT to get the PCA myGAPIT=GAPIT(GD=myGD,GM=myGM, PCA.total=3,file.output=FALSE) pcEnv=cbind(myGAPIT$PCA[,-1],myCV[,-1]) #prepare for five fold cross validation task nrep=30 #declare the number of reps nfold=5 #declare the number of folds #prepare null vectors to output the results (accuracies) into cor_pheno <- NULL cor_gebv <- NULL for (rep in 1:nrep){ c.pheno <- NULL c.gebv <- NULL #setup fold myCut=cut(1:n,nfold,labels=FALSE) #create the folds fold=sample(myCut,n) #randomly sample from the folds for (f in 1:nfold){ print(c(rep,f)) testing=(fold==f) training=!testing #use rrBLUP package to perform the genomic selection task ans.RR <- mixed.solve(y=myY[training,2],Z=M[training,],X=pcEnv[training,]) fix_effect <- as.matrix(pcEnv) %*% ans.RR$beta ran_effect <- M %*% ans.RR$u pred_pheno <- fix_effect + ran_effect Pred_v_Pheno <- cor(pred_pheno[testing,],mySim$Y[testing,2]) Pred_v_GEBV <- cor(ran_effect[testing,],mySim$u[testing]) c.pheno <- c(c.pheno,Pred_v_Pheno) c.gebv <- c(c.gebv, Pred_v_GEBV) } final_pheno <- sum(c.pheno)/nfold #get out the final phenotypic values final_gebv <- sum(c.gebv)/nfold #get out the final genomic estimated breeding values cor_pheno <- c(cor_pheno,final_pheno) #calculate the accuracy between the actual phenotype and the predicted phenotype cor_gebv <- c(cor_gebv, final_gebv) #calculate the accuracy between the actual GEBV and the predicted GEBV }#rep #calculate the summary statistics m1 <- round(mean(cor_pheno),3) m1 #0.784 m2 <- round(mean(cor_gebv),3) m2 #0.434 s1 <- round(sd(cor_pheno),3) s1 #0.005 s2 <- round(sd(cor_gebv),3) s2 #0.019