#~~~~~Homework Five~~~~~#
#~~~~~CROPS_545~~~~~#
#~~~~~Evan Craine~~~~~#

#set working directory
  setwd("~/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5")

#read in files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  X=myGD[,-1]
  
  n=nrow(X)
  m=ncol(X)
  
  #install the required packages
  install.packages("gplots")
  install.packages("LDheatmap")
  install.packages("genetics")
  install.packages("ape")
  install.packages("EMMREML")
  install.packages("scatterplot3d") 
  
  #load the required packages
  library(multtest)
  library(gplots)
  library(LDheatmap)
  library(genetics)
  library(ape)
  library(EMMREML)
  library(compiler) 
  library(mgcv)
  library(dplyr)
  library(plyr)
  library(scatterplot3d)
  library(MASS)
  library(multtest)
  
  #load the source code
  source("http://zzlab.net/GAPIT/gapit_functions.txt") #make sure compiler is running
  source("http://zzlab.net/GAPIT/emma.txt")
  source("GWASxPCA_GLM.R")
  source("G2P.R")
  source("https://bioconductor.org/biocLite.R")
  biocLite("multtest")
  
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5")
  
  

########################################################################################
#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~Question I~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#
########################################################################################
#Sample number of QTNs of your choice from the genetic markers used in homework2 and simulate QTN effects from a standard normal distribution. 
#Assign genetic effects for each individual. 
#Simulate normal distributed residual effects with appropriate variance to have a heritability of 0.75. 
#Add residual effects to the genetic effects to create phenotypes.  

  set.seed(99163)
  mySim<-G2P(X,h2=.75,alpha=1,NQTN=10,distribution="normal")
  QTN.position<-mySim$QTN.position
  y<-mySim$y
  
  #~~~~~~~~~~~~~~~~~~~~PACKAGE~~~~~~~~~~~~~~~~~~~~#
#use the GLM GWAS package you developed in homework 4 to perform association analyses with three PCs included as covariates
  
#here are the package parameters 
#GWASxPCA_GLM <- function(y=NULL, X=NULL, C=NULL, GM=NULL, PCA.M=1:3, QTN.position=NULL, cutoff=NULL){
  
#install and load "mgcv"
#install.packages("mgcv")
  library(mgcv)
  
###perform GWAS with Package
  myGLM <- GWASxPCA_GLM(y = y, X = X, GM=myGM, QTN.position = mySim$QTN.position)
 
###count number of false positives for identifying half of your QTNs 
#prepare the data frame to query
  
  Pvalue=myGLM$P.value #insert the pvalue
  Pvalue <- sort(Pvalue, decreasing=FALSE) #create an ordered p value vector
  SNP <- myGLM$order.SNP #creat a vector with the ordered SNP
  results <- data.frame(Pvalue, SNP) #create a data frame with the two columns
  results$IsTrue <- results[,2] %in% mySim$QTN.position #create a new column with true and false depending on QTN
  
  QTNs <- as.data.frame(as.numeric(results$IsTrue)) #create a data frame with a column showing presence of QTN with 1
  QTNs$sum <- cumsum(QTNs[,1]) #add a new column, and each time a QTN appears in a row add one to the this column
  
  
###false positives
  half <- ceiling(length(QTN.position)/2) #calculate the QTN number that is half the total number of QTN
  markers <- nrow(QTNs) #3093
  
  for (i in 1:markers){ #create a loop
    a <- QTNs[i,2] #each time look at the second column of a new row
    
    if (a == half){ #determine the integer value for half of the QTN, when we find this QTN we stop and return the row order
      SNPs <- i
      break}
  }
  
  SNPs #return the total number of SNP and QTN, including the half mark QTN
  falsePosi = SNPs - half #subtract the number of QTN from the number of sig SNP to get the number of false positives
  falsePosi #this is the number of SNP that are more significant than our cutoff QTN, and that are not QTN
  
  cutoffQTN = results[SNPs,1] #return the p value for the cutoff QTN
  
###Manhattan Plot for package
  color.vector <- rep(rainbow(5), length(unique(myGM[,2])))
  
  plot(t(-log10(myGLM$P.value))~seq(1:m),col=color.vector[myGM[,2]], #basic plot
       main="GWASxPCA_GLM Manhattan Plot", xlab="SNP Position", ylab="-log10(p)") 
  
  points(mySim$QTN.position, -log10(myGLM$P.value[mySim$QTN.position]), type="p", pch=20, cex=1,lwd=1.5,col="dimgrey")  # colors the QTN
  points(mySim$QTN.position, -log10(myGLM$P.value[mySim$QTN.position]), type="p", pch=21, cex=2,lwd=1.5,col="dimgrey")
  
  abline(h=-log10(cutoffQTN), col="green") #sub -log10(myGLM$cutoff.final) with p value for cutoff QTN to show false positive above the cutoff QTN
  
 
########################################################################################
#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~Question II~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#
########################################################################################
#Repeat simulation in (1) 30 times and exam statistical power vs. FDR at mapping resolution of 100,000 base pairs (20 points). 
#use G2P to simulate phenotype
#use package to perform GWAS
#recover p value and input into power function in GAPIT
#generate tables/plots (at resolution of 100kB) to summarize results
  
#~~~~~~~~~~~~~~~~~~~~GAPIT~~~~~~~~~~~~~~~~~~~~#
#use GAPIT to perform GWAS, calulate power, type I error, and FDR
#first create a new place to store our information
  GAPPER = "GAPIT"
  workingD = "~/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5"
  setwd(workingD)
  dir.create(file.path(workingD, GAPPER))
  setwd(paste(workingD, GAPPER, sep="/"))
  getwd() 
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/GAPIT") #just incase
  
#load the required information
  source("http://zzlab.net/GAPIT/gapit_functions.txt")
  source("G2P.R")
  source("https://bioconductor.org/biocLite.R")
  biocLite("multtest")
  
#load the required data
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  X=myGD[,-1]
  
#required to generate unique values
  set.seed(Sys.time()) 
  
#prepare the loop to execute the desired tasks in each replication
  reps=30
  
  power.rep=replicate(reps, {
    mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.75,NQTN=10,QTNDist="norm")
    gapit=GAPIT(
      Y=mySim$Y,GD=myGD, GM=myGM,
      QTN.position=mySim$QTN.position,PCA.total=3, 
      group.from = 1, group.to = 1,group.by = 10)
    power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,GWAS=cbind(myGM, gapit$P))
  })
  power=power.rep[[2]] # number string 0.1, 0.2 .... 1
  
#FDR
  s.fdr=seq(3,length(power.rep),7)
  fdr=power.rep[s.fdr] # create a table with the FDR values from each rep
  fdr.mean=Reduce ("+", fdr) / length(fdr) #average across the values using reduce
  
#AUC: power vs. FDR
  s.auc.fdr=seq(6,length(power.rep),7) #creat a list of number equal to the position of each auc.fdr value
  auc.fdr=power.rep[s.auc.fdr] #index the output sheet to return the desired values
  auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr) #average across these values to get the mean
  
#False positive
  s.falsePos=seq(5, length(power.rep),7) #create a list of numbers equal to the position of each falsePos value
  falsePos=power.rep[s.falsePos] #index the output sheet to return the target values 
  falsePos.mean=Reduce ("+", falsePos) / length(falsePos) #get the mean by using reduce across the values 
  falsePos_cutoff=falsePos.mean[5] #this is the average number of false positive at the fifth (5/10) QTN

#save these values for later
  GAPIT.fdr.mean=fdr.mean
  GAPIT.auc.fdr.mean=auc.fdr.mean
  GAPIT.falsePos.mean=falsePos.mean
  
  # Power vs. FDR plot
  color = rainbow(reps+1)
  plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1]) #these are the alpha values
  for(i in reps:ncol(fdr.mean)){ #change 2 to reps to include all of the 
    lines(fdr[[i]], power, type="b", col= color[i+1])
  }
  
########################################################################################
#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~Question III~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#~~~~~#
########################################################################################
  
  #~~~~~~~~~~~~~~~~~~~~A~~~~~~~~~~~~~~~~~~~~#
#Repeat (2) with your package replaced by following packages: 
  #PLINK
  
#set working directory 
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/PLINK")
  
#load required files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  
#prepare the phenotype file
  pheno=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.5,NQTN=10,QTNDist="norm")
  data.Y = data.frame(FID = pheno$Y[,1], SID = pheno$Y[,1], phenotype = pheno$Y[,-1])
  write.table(x = data.Y, file = "pheno.txt", row.names = F, quote = F, sep = "\t") #shows up as pheno.txt in WD
  
#prepare the genotypic data and genomic map files
  write.table(myGD, file = "genotype.txt", row.names=F, quote = F, sep = "\t")
  write.table(myGM, file = "mapinfo.txt", row.names = F, quote = F, sep = "\t")
  
#check to make sure the files are in the WD
  dir()
  
#now go to iPAT and run PLINK before proceeding
  # geno: missing rate, if less than 0.2, filter
  # maf: minor allel frequency, if less than 0.05, filter

#$$$PLINK once$$$#
#make sure the file names below match the ones in the wd
  system("./plink --ped genotype_recode.ped --assoc --map genotype_recode.map --allow-no-sex --adjust --pheno pheno.txt --all-pheno -out outP --geno 0.2 --maf 0.05")
  output = read.table("Module_1.P1.qassoc", header = T)
  p=output$P
  length(p)
  plinkm<-merge(myGM,output, by= "SNP", all=TRUE)
  # system("./plink --ped myGD_recode.ped --assoc --map myGD_recode.map --allow-no-sex
  #       --adjust --pheno pheno.txt --all-pheno -out outP --geno 0.2 --maf 0.05")
  
#now repeat from above to calculate power, FDR
  power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,GWAS=plinkm[,c(-4:-10)])
  
  PLINKplot <- plot(Power~FDR, data=power, type="b")
  
  
  # Power vs. FDR plot
  color = rainbow(reps+1)
  plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1]) #these are the alpha values
  for(i in reps:ncol(fdr.mean)){
    lines(fdr[[i]], power, type="b", col= color[i+1])
  }
  
#$$$PLINK 30 times$$$#  

#create a new folder and then set the working directory
  
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/PLINK30")

#now go to iPAT and run PLINK before proceeding
# geno: missing rate, if less than 0.2, filter
# maf: minor allel frequency, if less than 0.05, filter
  
#set the desired number of replications  
  reps=30
  
#perform the task  

  statRep=replicate(reps, {
    pheno=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.5,NQTN=10,QTNDist="norm")
    data.Y = data.frame(FID = pheno$Y[,1], SID = pheno$Y[,1], phenotype = pheno$Y[,-1]) #taxa, taxa, and phenotypic value
    write.table(x = data.Y, file = "pheno.txt", row.names = F, quote = F, sep = "\t")
    system("./plink --ped genotype_recode.ped --assoc --map genotype_recode.map --allow-no-sex --adjust --pheno pheno.txt --all-pheno --geno 0.2 --maf 0.05")
    output = read.table("plink.P1.qassoc", header = T)
    plink.result = join(myGM,output, by="SNP")
    power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=pheno$QTN.position,
                          GWAS=cbind(myGM, plink.result$P))
  })
  
  power=statRep[[2]]
  
#FDR
  s.fdr=seq(3,length(statRep),7)
  fdr=statRep[s.fdr]
  fdr.mean=Reduce ("+", fdr) / length(fdr)
  
#AUC: power vs. FDR
  s.auc.fdr=seq(6,length(statRep),7)
  auc.fdr=statRep[s.auc.fdr]
  auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr)
  

#Power vs. FDR plot
  color = rainbow(reps+1)
  plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1])
  for(i in reps:ncol(fdr.mean)){
    lines(fdr[[i]], power, type="b", col= color[i+1])
  }
  
#save the values for later
  PLINK.fdr.mean=fdr.mean
  PLINK.auc.fdr.mean=auc.fdr.mean

  #~~~~~~~~~~~~~~~~~~~~B~~~~~~~~~~~~~~~~~~~~#
#Repeat (2) with your package replaced by following packages: 
  #FarmCPU
#create a folder for FarmCPU and set it as the working directory 
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/FarmCPU")

# FarmCPU 1 ----
  library(bigmemory)
  library(biganalytics)
  # require(compiler) #for cmpfun
  source("http://www.zzlab.net/FarmCPU/FarmCPU_functions.txt")#web source code
  #load required files
  myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
  myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
  
  
#simulate the phenotype using GAPIT
  mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.75,NQTN=10,QTNDist="norm")
  
  # perform GWAS by FarmCPU
  myFarmCPU= FarmCPU(
    Y=mySim$Y,
    GD=myGD,
    GM=myGM) 
  # Getting p-values
  myP=as.numeric(myFarmCPU$GWAS$P.value)
  # Combine gene map file with p value (so we know the location of the markers)
  myGI.MP=cbind(myGM[,-1],myP)
  # Create Manhattan plot
  # The output of this Manhattan plot will go to the folder
  GAPIT.Manhattan(GI.MP=myGI.MP,seqQTN=mySim$QTN.position)
  # Create QQ plot
  # Plot will go to the output folder
  GAPIT.QQ(myP)
  
  power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,
                        GWAS=cbind(myGM, myP))
  farmCPUplot <- plot(Power~FDR, data=power, type="b")
  
  
  
  ##  Replicate farmCPU 30 times
#create a new folder and set the WD
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/replicatefarmCPU")  
  getwd()
  
  nrep=30
  statRep=replicate(nrep, {
    mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.75,NQTN=10,QTNDist="norm")
    myFarm=FarmCPU(Y=mySim$Y,GD=myGD,GM=myGM)
    power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,GWAS=cbind(myGM, as.numeric(myFarm$GWAS$P.value)))
  })
  
  
  power=statRep[[2]]
  
#FDR
  s.fdr=seq(3,length(statRep),7)
  fdr=statRep[s.fdr]
  fdr.mean=Reduce ("+", fdr) / length(fdr)
  
#AUC: power vs. FDR
  s.auc.fdr=seq(6,length(statRep),7)
  auc.fdr=statRep[s.auc.fdr]
  auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr)
  
#save these values for later  
  FarmCPU.fdr.mean=fdr.mean
  FarmCPU.auc.fdr.mean=auc.fdr.mean
  
  color = rainbow(reps+1)
  plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1])
  for(i in reps:ncol(fdr.mean)){
    lines(fdr[[i]], power, type="b", col= color[i+1])
  }

  
  #~~~~~~~~~~~~~~~~~~~~C~~~~~~~~~~~~~~~~~~~~#
#Repeat (2) with your package replaced by following packages: 
  #BLINK
  
#$$$$$BLINK one time$$$$$#
  
#create a folder and set it as the working director
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/BLINK")
  
#make sure the necessary files are present (blink_mac) in the wd
  dir()
  
  myY=mySim$Y
  colnames(myY)=c("taxa", "SimPheno")
  myGD1=t(myGD[,-1])
  # create file materials
  write.table(myY,file="myData.txt",quote=F,row.name=F,col.name=T,sep="\t")
  write.table(myGD1,file="myData.dat",quote=F,row.name=F,col.name=F,sep="\t")
  write.table(myGM,file="myData.map",quote=F,row.name=F,col.name=T,sep="\t")

#check to make sure the correct files are in the wd
  dir()
  
#run blink 
  system("./blink_mac --file myData --max_loop 10 --numeric --gwas")

  
#Extract p values
  result.blink= read.table("SimPheno_GWAS_result.txt", head = TRUE)
  myP=as.numeric(result.blink$p_value)
  power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,
                        GWAS=cbind(myGM, myP))
  plot(Power~FDR, data=power, type="b")
  
  
#$$$$$BLINK 30 times$$$$$#

#set the wd
  setwd("~/evancraine@gmail_DROPBOX/Dropbox/evan.craine@wsu.edu/a_Spring_2018/CROPS_545/HW/HW5/BLINK")
  
  
  nrep=30
  
  power.rep=replicate(nrep, {
    mySim=GAPIT.Phenotype.Simulation(GD=myGD,GM=myGM,h2=.75,NQTN=10,QTNDist="norm")
    myY=mySim$Y
    colnames(myY)=c("taxa", "SimPheno")
    write.table(myY,file="myData.txt",quote=F,row.name=F,col.name=T,sep="\t")
    system("./blink_mac --file myData --max_loop 10 --numeric --gwas")
    result.blink= read.table("SimPheno_GWAS_result.txt", head = TRUE)
    power=GAPIT.FDR.TypeI(WS=1e5,GM=myGM,seqQTN=mySim$QTN.position,
                          GWAS=cbind(myGM, as.numeric(result.blink$p_value)))
  })
  
  power=power.rep[[2]]
  #FDR
  s.fdr=seq(3,length(power.rep),7)
  fdr=power.rep[s.fdr] # list of all three FDR
  fdr.mean=Reduce ("+", fdr) / length(fdr)
  
  #AUC: power vs. FDR
  s.auc.fdr=seq(6,length(power.rep),7)
  auc.fdr=power.rep[s.auc.fdr]
  auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr)
  
  # Power vs. FDR plot
  color = rainbow(nrep+1)
  plot(fdr.mean[,1], power, type="b",xlim=c(0,1), col=color[1])
  for(i in nrep:ncol(fdr.mean)){
    lines(fdr[[i]], power, type="b", col= color[i+1])
  }
  
#save these values for later
  BLINK.fdr.mean=fdr.mean
  BLINK.auc.fdr.mean=auc.fdr.mean

  
#$$$$$$summary$$$$$$#

  GAPIT.fdr.mean=fdr.mean
  GAPIT.auc.fdr.mean=auc.fdr.mean #0.6404709
  GAPIT.falsePos.mean=falsePos.mean
  
  FarmCPU.fdr.mean=fdr.mean
  FarmCPU.auc.fdr.mean=auc.fdr.mean #0.8068327
  
  PLINK.fdr.mean=fdr.mean
  PLINK.auc.fdr.mean=auc.fdr.mean #0.4830981
  
  BLINK.fdr.mean=fdr.mean
  BLINK.auc.fdr.mean=auc.fdr.mean #0.8598226
  
  final.power=powerALL

#prepare summary plot  
  plot(GAPIT.fdr.mean[,1], final.power, type="b", lty=1, xlim=c(0,1), col="red", xlab="FDR")
  lines(FarmCPU.fdr.mean[,1], final.power, type = "b", lty=2, col="blue")
  lines(PLINK.fdr.mean[,1], final.power, type = "b", lty=3, col="green")
  lines(BLINK.fdr.mean[,1], final.power, type = "b", lty=4, col="black")
  
  legend("bottomright", legend=c("GAPIT", "FarmCPU", "PLINK", "BLINK"),
         col=c("red", "blue", "green","black"), lty=1:4, cex=0.8)