################################Loading data and preparing packages########################### #getting the genotype file to simulate from myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) #installing rrBlup package for problem 5 install.packages("rrBLUP") #loading rrBLUP library(rrBLUP) ##need this for gapit to work right specifically the van raden function library(compiler) #loading library for heatmap in gapit library(gplots) #sourcing emma from zzlab.net source("http://www.zzlab.net/GAPIT/emma.txt") #sourcing GAPIT from zzlab.net source("http://www.zzlab.net/GAPIT/gapit_functions.txt") #setting things up for phenotype simulation X=myGD[,-1] index1to5=myGM[,2]<6 X1to5 = X[,index1to5] taxa=myGD[,1] #set.seed(99164) # Save the markers on chromosomes 1-5 with individual names. GD.candidate=cbind(taxa,X1to5) #using gapit to simulate phenotypes mySim=GAPIT.Phenotype.Simulation(GD=GD.candidate,GM=myGM[index1to5,],h2=.50,NQTN=20,QTNDist="normal") #plotting distribution of of the QTN effects hist(mySim$effect, main = "Histogram of QTN effects") #plotting effects by QTN plot(mySim$QTN.position, mySim$effect, ylab = "effect of QTN", xlab = "position", main = "Effect by QTN ppostion") #ploting correlations uYcorr = cor(mySim$u, mySim$Y$V1) r_sq = uYcorr^2 #plots the two variables against eachother plot(mySim$u, mySim$Y$V1, ylab = "phenotype", xlab = "genetic effect", main = "PHENOTYPE & BREEDING VALUE CORRELATION") #paste text onto side 4 for r^2 value mtext(paste("R^2", r_sq, sep = " "), side = 4) #########################################Problem 2 ######################################### ##sourcing farm_cpu from zzlab.net source("http://www.zzlab.net/FarmCPU/FarmCPU_functions.txt") #loading needed libraries library("bigmemory") library("biganalytics") #using gapit to get PCA's to include into model myGAPIT0=GAPIT(GD=myGD,GM=myGM,PCA.total=4) #sets PC's as matrix myPC=as.matrix(myGAPIT0$PCA[,-1]) #Makes a Y table to fit FarmCPU myY=data.frame(v1=myGD[,1],v2=mySim$Y$V1) #Runs FarmCPU myGWAS = FarmCPU( Y=myY, GD=myGD, GM=myGM, CV=myPC ) #finding the top 20 snp ntop=20 index=order(myGWAS$GWAS$P.value) top=index[1:ntop] #makes matrix of QTN for later myQTN=cbind(myGD[,1],myGAPIT0$PCA[,-1], myGD[,c(top+1)]) #Loops a cross-validation function thirty times crossVal_2=replicate(30,{ #Selects 140 random individuals designed by row # to be testing, rest training #One individual is dropped from training to make the numbers equal for correlation later n=nrow(myY) #setting up the sampling testing=sample(n,round(n/2),replace=F) training.initial=setdiff(seq(1:n),testing) #randomly removing a row due to unEven num individuals remove.row=sample(length(training.initial),1) training=training.initial[-remove.row] #Runs GAPIT and assesses prediction accuracy myGAPIT <- GAPIT( Y=mySim$Y[training,], GD=myGD, GM=myGM, CV=myQTN, group.from=1, group.to=1, group.by=1, SNP.test=FALSE, memo="GLM+QTN") #Calculates the effect from the PCA + 20 top SNPs effect=myGAPIT$effect.cv #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects effect.matrix=as.matrix(cbind(1,myQTN[,-1])) Pred=effect.matrix%*%effect #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population ry2=cor(Pred[testing],mySim$Y[testing,2])^2 ru2=cor(Pred[testing],mySim$u[testing])^2 return(list(phen = ry2, BV=ru2)) }) crossVal_2 #mean of phenotype correlation pMeanCor = mean(as.numeric(crossVal_2[1, ])) pMeanCor #standard deviation of phenotype correlation pSdCor = sd(as.numeric(crossVal_2[1, ])) pSdCor #mean of breeding value correlation bMeanCor = mean(as.numeric(crossVal_2[2, ])) bMeanCor #standard deviation of breeding value correlation bSdCor = sd(as.numeric(crossVal_2[2, ])) bSdCor #########################Problem 3###################### #reshuffling phenotypes and doing stuff that was not done in problem 2 #Randomly re-samples rows from the phenotype set rows=sample(length(myY[,1]),replace=FALSE) #Creates a shuffled phenotype data frame where the phenotypes do not match the QTN effects myShuffle=data.frame(v1=myGD[,1],v2=myY[rows,2]) cross_val_3=replicate(30,{ #Selects 140 random individuals designed by row # to be testing, rest training #One individual is dropped from training to make the numbers equal for correlation later n=nrow(myShuffle) testing=sample(n,round(n/2),replace=F) training.initial=setdiff(seq(1:n),testing) remove.row=sample(length(training.initial),1) training=training.initial[-remove.row] #Runs GAPIT and assesses prediction accuracy myGAPIT <- GAPIT( Y=myShuffle[training,], GD=myGD, GM=myGM, CV=myQTN, group.from=1, group.to=1, group.by=1, SNP.test=FALSE, memo="GLM+QTN") #Calculates the effect from the PCA + 20 top SNPs effect=myGAPIT$effect.cv #Predicts phenotypes by multiplying genotypes of 20 SNPs by estimated effects effect.matrix=as.matrix(cbind(1,myQTN[,-1])) Pred=effect.matrix%*%effect #Correlates the predicted phenotypes against breeding value and actual phenotype in testing population ry2=cor(Pred[testing],myShuffle[testing,2])^2 ru2=cor(Pred[testing],mySim$u[testing])^2 return(list(phen = ry2, BV=ru2)) }) cross_val_3 #mean value of phenotype cor p3CorMean = mean(as.numeric(cross_val_3[1,])) p3CorMean #sd of phenotypes cor p3CorSd = sd(as.numeric(cross_val_3[1,])) p3CorSd #mean value of breeding values cor b3CorMean = mean(as.numeric(cross_val_3[2,])) b3CorMean #sd of breeding value cor b3CorSd = sd(as.numeric(cross_val_3[2, ])) b3CorSd ################################Problem 4 ##################################### #set number for the number of reps reps = 30 #now perform the replications Gblup_reps = replicate(reps, { #sample 20% as testing testing = sample(n, round(n/5), replace = FALSE) #anything indexes not testing will be set to training with setdiff function training = setdiff(seq(1:n), testing) #actually doing Gblup mygBLUP=GAPIT( Y=mySim$Y[training,], GD=myGD, GM=myGM, PCA.total=3, group.from=1000, group.to=1000, group.by=10, SNP.test=FALSE, memo="gBLUP" ) #getting correlations, Not sure why ryan got R^2 in 2017 hwk but i wont do that ry2.blup.train=cor(mygBLUP$Pred[training,5],mySim$Y[training,2]) ru2.blup.train=cor(mygBLUP$Pred[training,5],mySim$u[training]) ry2.blup.test=cor(mygBLUP$Pred[testing,5],mySim$Y[testing,2]) ru2.blup.test=cor(mygBLUP$Pred[testing,5],mySim$u[testing]) return(list(phen.train = ry2.blup.train, BV.train = ru2.blup.train, phen.test = ry2.blup.test, BV.test = ru2.blup.test)) }) Gblup_reps #mean pheno meanPhenoTrain = mean(as.numeric(Gblup_reps[1,])) meanPhenoTrain #sd pheno sdPhenoTrain = sd(as.numeric(Gblup_reps[1,])) sdPhenoTrain #mean BV meanBVTrain = mean(as.numeric(Gblup_reps[2, ])) meanBVTrain #sd BV sdBVTrain = sd(as.numeric((Gblup_reps[2, ]))) sdBVTrain meanPhenoTest = mean(as.numeric(Gblup_reps[3,])) meanPhenoTest #sd pheno sdPhenoTest = sd(as.numeric(Gblup_reps[3,])) sdPhenoTest #mean BV meanBVTest = mean(as.numeric(Gblup_reps[4, ])) meanBVTest #sd BV sdBVTest = sd(as.numeric((Gblup_reps[4, ]))) sdBVTest hist(as.numeric(Gblup_reps[3, ])) ####################################Problem 5 ########################### #preparing a dataframe #remove first column X = myGD[, -1] #prepare data y = mySim$Y[,2] M = as.matrix(X) #set reps reps=30 rrBLUP_reps=replicate(reps,{ #objects to keep adding too inside of loop initialized to zero ry_increaser=0 ru_increaser=0 #break data into 5 groups (its actually the indicies notice the seq) slices = cut(seq(1,nrow(mySim$Y)),breaks=5,labels=FALSE) #pull one of the initial folds (a slice) out using sample function folds = sample(slices,length(slices),replace=FALSE) #one rep for each slice for(i in 1:5){ #selects one test group then test on all 5 folds #the rotates through all five as test group testIndexes=which(folds==i,arr.ind=TRUE) M.test = as.matrix(X[testIndexes,]) M.train = as.matrix(X[-testIndexes,]) y.test = mySim$Y[testIndexes,][,2] y.train = mySim$Y[-testIndexes,][,2] #Solve using ridge regression blup.train = mixed.solve(y=y.train,Z=M.train) #use blups to calculate effect GEBV.train<- M%*%blup.train$u #Calculate correlations ry.iCor=cor(GEBV.train[testIndexes],mySim$Y[testIndexes,2]) ru.iCor=cor(GEBV.train[testIndexes],mySim$u[testIndexes]) #The "total" value is the current average of each r value #It is updated with the average re-calculated in each iteration #Alternately, values could be added to a vector that is then averaged at the end ry_increaser=ry_increaser+ry.iCor ru_increaser=ru_increaser+ru.iCor } #devide the added up correlations by the number of loops ry_increaser = ry_increaser / 5 ru_increaser = ru_increaser / 5 return(list(pheno_5_cor=ry_increaser,breedVal_5_cor=ru_increaser)) }) rrBLUP_reps #time to get phenotype cor mean #set list to nuemeric and take mean and use index to access list rrblupPhenoMean = mean(as.numeric(rrBLUP_reps[1,])) rrblupPhenoMean rrblupPhenoSD = sd(as.numeric(rrBLUP_reps[1,])) rrblupPhenoSD rrblupBVMean = mean(as.numeric(rrBLUP_reps[2,])) rrblupBVMean rrblupBVSD = sd(as.numeric(rrBLUP_reps[2,])) rrblupBVSD