#~~~~~Homework Five~~~~~# #~~~~~CROPS_545~~~~~# #~~~~~Evan Craine~~~~~# #set working directory setwd("~/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5") #read in files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) X=myGD[,-1] n=nrow(X) m=ncol(X) #install the required packages install.packages("gplots") install.packages("LDheatmap") install.packages("genetics") install.packages("ape") install.packages("EMMREML") install.packages("scatterplot3d") #load the required packages library(multtest) library(gplots) library(LDheatmap) library(genetics) library(ape) library(EMMREML) library(compiler) library(mgcv) library(dplyr) library(plyr) library(scatterplot3d) library(MASS) library(multtest) #load the source code source("http://zzlab.net/GAPIT/gapit_functions.txt") #make sure compiler is running source("http://zzlab.net/GAPIT/emma.txt") source("GWASxPCA_GLM.R") source("G2P.R") source("https://bioconductor.org/biocLite.R") biocLite("multtest") setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5") ######################################################################################## #~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~Question I~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~# ######################################################################################## #Sample number of QTNs of your choice from the genetic markers used in homework2 and simulate QTN effects from a standard normal distribution. #Assign genetic effects for each individual. #Simulate normal distributed residual effects with appropriate variance to have a heritability of 0.75. #Add residual effects to the genetic effects to create phenotypes. set.seed(99163) mySim<-G2P(X,h2=.75,alpha=1,NQTN=10,distribution="normal") QTN.position<-mySim$QTN.position y<-mySim$y #~~~~~~~~~~~~~~~~~~~~PACKAGE~~~~~~~~~~~~~~~~~~~~# #use the GLM GWAS package you developed in homework 4 to perform association analyses with three PCs included as covariates #here are the package parameters #GWASxPCA_GLM <- function(y=NULL, X=NULL, C=NULL, GM=NULL, PCA.M=1:3, QTN.position=NULL, cutoff=NULL){ #install and load "mgcv" #install.packages("mgcv") library(mgcv) ###perform GWAS with Package myGLM <- GWASxPCA_GLM(y = y, X = X, GM=myGM, QTN.position = mySim$QTN.position) ###count number of false positives for identifying half of your QTNs #prepare the data frame to query Pvalue=myGLM$P.value #insert the pvalue Pvalue <- sort(Pvalue, decreasing=FALSE) #create an ordered p value vector SNP <- myGLM$order.SNP #creat a vector with the ordered SNP results <- data.frame(Pvalue, SNP) #create a data frame with the two columns results$IsTrue <- results[,2] %in% mySim$QTN.position #create a new column with true and false depending on QTN QTNs <- as.data.frame(as.numeric(results$IsTrue)) #create a data frame with a column showing presence of QTN with 1 QTNs$sum <- cumsum(QTNs[,1]) #add a new column, and each time a QTN appears in a row add one to the this column ###false positives half <- ceiling(length(QTN.position)/2) #calculate the QTN number that is half the total number of QTN markers <- nrow(QTNs) #3093 for (i in 1:markers){ #create a loop a <- QTNs[i,2] #each time look at the second column of a new row if (a == half){ #determine the integer value for half of the QTN, when we find this QTN we stop and return the row order SNPs <- i break} } SNPs #return the total number of SNP and QTN, including the half mark QTN falsePosi = SNPs - half #subtract the number of QTN from the number of sig SNP to get the number of false positives falsePosi #this is the number of SNP that are more significant than our cutoff QTN, and that are not QTN cutoffQTN = results[SNPs,1] #return the p value for the cutoff QTN ###Manhattan Plot for package color.vector <- rep(rainbow(5), length(unique(myGM[,2]))) plot(t(-log10(myGLM$P.value))~seq(1:m),col=color.vector[myGM[,2]], #basic plot main="GWASxPCA_GLM Manhattan Plot", xlab="SNP Position", ylab="-log10(p)") points(mySim$QTN.position, -log10(myGLM$P.value[mySim$QTN.position]), type="p", pch=20, cex=1,lwd=1.5,col="dimgrey") # colors the QTN points(mySim$QTN.position, -log10(myGLM$P.value[mySim$QTN.position]), type="p", pch=21, cex=2,lwd=1.5,col="dimgrey") abline(h=-log10(cutoffQTN), col="green") #sub -log10(myGLM$cutoff.final) with p value for cutoff QTN to show false positive above the cutoff QTN ######################################################################################## #~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~Question II~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~# ######################################################################################## #Repeat simulation in (1) 30 times and exam statistical power vs. FDR at mapping resolution of 100,000 base pairs (20 points). #use G2P to simulate phenotype #use package to perform GWAS #recover p value and input into power function in GAPIT #generate tables/plots (at resolution of 100kB) to summarize results #~~~~~~~~~~~~~~~~~~~~GAPIT~~~~~~~~~~~~~~~~~~~~# #use GAPIT to perform GWAS, calulate power, type I error, and FDR #first create a new place to store our information GAPPER = "GAPIT" workingD = "~/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5" setwd(workingD) dir.create(file.path(workingD, GAPPER)) setwd(paste(workingD, GAPPER, sep="/")) getwd() setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/GAPIT") #just incase #load the required information source("http://zzlab.net/GAPIT/gapit_functions.txt") source("G2P.R") source("https://bioconductor.org/biocLite.R") biocLite("multtest") #load the required data myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) X=myGD[,-1] #required to generate unique values set.seed(Sys.time()) #prepare the loop to execute the desired tasks in each replication reps=30 power.rep=replicate(reps, { mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.75,NQTN=10,QTNDist="norm") gapit=GAPIT( Y=mySim$Y,GD=myGD, GM=myGM, QTN.position=mySim$QTN.position,PCA.total=3, group.from = 1, group.to = 1,group.by = 10) power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,GWAS=cbind(myGM, gapit$P)) }) power=power.rep[[2]] # number string 0.1, 0.2 .... 1 #FDR s.fdr=seq(3,length(power.rep),7) fdr=power.rep[s.fdr] # create a table with the FDR values from each rep fdr.mean=Reduce ("+", fdr) / length(fdr) #average across the values using reduce #AUC: power vs. FDR s.auc.fdr=seq(6,length(power.rep),7) #creat a list of number equal to the position of each auc.fdr value auc.fdr=power.rep[s.auc.fdr] #index the output sheet to return the desired values auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr) #average across these values to get the mean #False positive s.falsePos=seq(5, length(power.rep),7) #create a list of numbers equal to the position of each falsePos value falsePos=power.rep[s.falsePos] #index the output sheet to return the target values falsePos.mean=Reduce ("+", falsePos) / length(falsePos) #get the mean by using reduce across the values falsePos_cutoff=falsePos.mean[5] #this is the average number of false positive at the fifth (5/10) QTN #save these values for later GAPIT.fdr.mean=fdr.mean GAPIT.auc.fdr.mean=auc.fdr.mean GAPIT.falsePos.mean=falsePos.mean # Power vs. FDR plot color = rainbow(reps+1) plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1]) #these are the alpha values for(i in reps:ncol(fdr.mean)){ #change 2 to reps to include all of the lines(fdr[[i]], power, type="b", col= color[i+1]) } ######################################################################################## #~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~Question III~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~# ######################################################################################## #~~~~~~~~~~~~~~~~~~~~A~~~~~~~~~~~~~~~~~~~~# #Repeat (2) with your package replaced by following packages: #PLINK #set working directory setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/PLINK") #load required files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) #prepare the phenotype file pheno=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.5,NQTN=10,QTNDist="norm") data.Y = data.frame(FID = pheno$Y[,1], SID = pheno$Y[,1], phenotype = pheno$Y[,-1]) write.table(x = data.Y, file = "pheno.txt", row.names = F, quote = F, sep = "\t") #shows up as pheno.txt in WD #prepare the genotypic data and genomic map files write.table(myGD, file = "genotype.txt", row.names=F, quote = F, sep = "\t") write.table(myGM, file = "mapinfo.txt", row.names = F, quote = F, sep = "\t") #check to make sure the files are in the WD dir() #now go to iPAT and run PLINK before proceeding # geno: missing rate, if less than 0.2, filter # maf: minor allel frequency, if less than 0.05, filter #$$$PLINK once$$$# #make sure the file names below match the ones in the wd system("./plink --ped genotype_recode.ped --assoc --map genotype_recode.map --allow-no-sex --adjust --pheno pheno.txt --all-pheno -out outP --geno 0.2 --maf 0.05") output = read.table("Module_1.P1.qassoc", header = T) p=output$P length(p) plinkm<-merge(myGM,output, by= "SNP", all=TRUE) # system("./plink --ped myGD_recode.ped --assoc --map myGD_recode.map --allow-no-sex # --adjust --pheno pheno.txt --all-pheno -out outP --geno 0.2 --maf 0.05") #now repeat from above to calculate power, FDR power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,GWAS=plinkm[,c(-4:-10)]) PLINKplot <- plot(Power~FDR, data=power, type="b") # Power vs. FDR plot color = rainbow(reps+1) plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1]) #these are the alpha values for(i in reps:ncol(fdr.mean)){ lines(fdr[[i]], power, type="b", col= color[i+1]) } #$$$PLINK 30 times$$$# #create a new folder and then set the working directory setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/PLINK30") #now go to iPAT and run PLINK before proceeding # geno: missing rate, if less than 0.2, filter # maf: minor allel frequency, if less than 0.05, filter #set the desired number of replications reps=30 #perform the task statRep=replicate(reps, { pheno=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.5,NQTN=10,QTNDist="norm") data.Y = data.frame(FID = pheno$Y[,1], SID = pheno$Y[,1], phenotype = pheno$Y[,-1]) #taxa, taxa, and phenotypic value write.table(x = data.Y, file = "pheno.txt", row.names = F, quote = F, sep = "\t") system("./plink --ped genotype_recode.ped --assoc --map genotype_recode.map --allow-no-sex --adjust --pheno pheno.txt --all-pheno --geno 0.2 --maf 0.05") output = read.table("plink.P1.qassoc", header = T) plink.result = join(myGM,output, by="SNP") power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=pheno$QTN.position, GWAS=cbind(myGM, plink.result$P)) }) power=statRep[[2]] #FDR s.fdr=seq(3,length(statRep),7) fdr=statRep[s.fdr] fdr.mean=Reduce ("+", fdr) / length(fdr) #AUC: power vs. FDR s.auc.fdr=seq(6,length(statRep),7) auc.fdr=statRep[s.auc.fdr] auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr) #Power vs. FDR plot color = rainbow(reps+1) plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1]) for(i in reps:ncol(fdr.mean)){ lines(fdr[[i]], power, type="b", col= color[i+1]) } #save the values for later PLINK.fdr.mean=fdr.mean PLINK.auc.fdr.mean=auc.fdr.mean #~~~~~~~~~~~~~~~~~~~~B~~~~~~~~~~~~~~~~~~~~# #Repeat (2) with your package replaced by following packages: #FarmCPU #create a folder for FarmCPU and set it as the working directory setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/FarmCPU") # FarmCPU 1 ---- library(bigmemory) library(biganalytics) # require(compiler) #for cmpfun source("http://www.zzlab.net/FarmCPU/FarmCPU_functions.txt")#web source code #load required files myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) #simulate the phenotype using GAPIT mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.75,NQTN=10,QTNDist="norm") # perform GWAS by FarmCPU myFarmCPU= FarmCPU( Y=mySim$Y, GD=myGD, GM=myGM) # Getting p-values myP=as.numeric(myFarmCPU$GWAS$P.value) # Combine gene map file with p value (so we know the location of the markers) myGI.MP=cbind(myGM[,-1],myP) # Create Manhattan plot # The output of this Manhattan plot will go to the folder GAPIT.Manhattan(GI.MP=myGI.MP,seqQTN=mySim$QTN.position) # Create QQ plot # Plot will go to the output folder GAPIT.QQ(myP) power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position, GWAS=cbind(myGM, myP)) farmCPUplot <- plot(Power~FDR, data=power, type="b") ## Replicate farmCPU 30 times #create a new folder and set the WD setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/replicatefarmCPU") getwd() nrep=30 statRep=replicate(nrep, { mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.75,NQTN=10,QTNDist="norm") myFarm=FarmCPU(Y=mySim$Y,GD=myGD,GM=myGM) power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,GWAS=cbind(myGM, as.numeric(myFarm$GWAS$P.value))) }) power=statRep[[2]] #FDR s.fdr=seq(3,length(statRep),7) fdr=statRep[s.fdr] fdr.mean=Reduce ("+", fdr) / length(fdr) #AUC: power vs. FDR s.auc.fdr=seq(6,length(statRep),7) auc.fdr=statRep[s.auc.fdr] auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr) #save these values for later FarmCPU.fdr.mean=fdr.mean FarmCPU.auc.fdr.mean=auc.fdr.mean color = rainbow(reps+1) plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1]) for(i in reps:ncol(fdr.mean)){ lines(fdr[[i]], power, type="b", col= color[i+1]) } #~~~~~~~~~~~~~~~~~~~~C~~~~~~~~~~~~~~~~~~~~# #Repeat (2) with your package replaced by following packages: #BLINK #$$$$$BLINK one time$$$$$# #create a folder and set it as the working director setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/BLINK") #make sure the necessary files are present (blink_mac) in the wd dir() myY=mySim$Y colnames(myY)=c("taxa", "SimPheno") myGD1=t(myGD[,-1]) # create file materials write.table(myY,file="myData.txt",quote=F,row.name=F,col.name=T,sep="\t") write.table(myGD1,file="myData.dat",quote=F,row.name=F,col.name=F,sep="\t") write.table(myGM,file="myData.map",quote=F,row.name=F,col.name=T,sep="\t") #check to make sure the correct files are in the wd dir() #run blink system("./blink_mac --file myData --max_loop 10 --numeric --gwas") #Extract p values result.blink= read.table("SimPheno_GWAS_result.txt", head = TRUE) myP=as.numeric(result.blink$p_value) power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position, GWAS=cbind(myGM, myP)) plot(Power~FDR, data=power, type="b") #$$$$$BLINK 30 times$$$$$# #set the wd setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/BLINK") nrep=30 power.rep=replicate(nrep, { mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.75,NQTN=10,QTNDist="norm") myY=mySim$Y colnames(myY)=c("taxa", "SimPheno") write.table(myY,file="myData.txt",quote=F,row.name=F,col.name=T,sep="\t") system("./blink_mac --file myData --max_loop 10 --numeric --gwas") result.blink= read.table("SimPheno_GWAS_result.txt", head = TRUE) power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position, GWAS=cbind(myGM, as.numeric(result.blink$p_value))) }) power=power.rep[[2]] #FDR s.fdr=seq(3,length(power.rep),7) fdr=power.rep[s.fdr] # list of all three FDR fdr.mean=Reduce ("+", fdr) / length(fdr) #AUC: power vs. FDR s.auc.fdr=seq(6,length(power.rep),7) auc.fdr=power.rep[s.auc.fdr] auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr) # Power vs. FDR plot color = rainbow(nrep+1) plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1]) for(i in nrep:ncol(fdr.mean)){ lines(fdr[[i]], power, type="b", col= color[i+1]) } #save these values for later BLINK.fdr.mean=fdr.mean BLINK.auc.fdr.mean=auc.fdr.mean #$$$$$$summary$$$$$$# GAPIT.fdr.mean=fdr.mean GAPIT.auc.fdr.mean=auc.fdr.mean #0.6404709 GAPIT.falsePos.mean=falsePos.mean FarmCPU.fdr.mean=fdr.mean FarmCPU.auc.fdr.mean=auc.fdr.mean #0.8068327 PLINK.fdr.mean=fdr.mean PLINK.auc.fdr.mean=auc.fdr.mean #0.4830981 BLINK.fdr.mean=fdr.mean BLINK.auc.fdr.mean=auc.fdr.mean #0.8598226 final.power=powerALL #prepare summary plot plot(GAPIT.fdr.mean[,1], final.power, type="b", lty=1, xlim=c(0,1), col="red", xlab="FDR") lines(FarmCPU.fdr.mean[,1], final.power, type = "b", lty=2, col="blue") lines(PLINK.fdr.mean[,1], final.power, type = "b", lty=3, col="green") lines(BLINK.fdr.mean[,1], final.power, type = "b", lty=4, col="black") legend("bottomright", legend=c("GAPIT", "FarmCPU", "PLINK", "BLINK"), col=c("red", "blue", "green","black"), lty=1:4, cex=0.8)