# Crop 545 - lab 8
# Power and FDR
# Y.Song Feb 21, 2018

# 000000000000000000000000000000000000000000000
#
#      0. Set things up ----
#
# 000000000000000000000000000000000000000000000

rm(list=ls()) # remove any object in the 'Environment'
# setwd( ) # Please set working directory as need

# The compiler library is required for the cmpfun function
# It should be already installed. If not, please run the next line
# install.package('compiler') 
library(compiler) #required for cmpfun

# functions will be used for this lab:
source("G2P.R")
source("GWASbyCor.R")

source("http://www.zzlab.net/GAPIT/gapit_functions.txt")
# the gapit_functions is a group of functions.
# You've been familiar with loading one single function in a file
# Notice that this time we sourced the gapit function group, and
# you'll see many functions loaded this time.


# the example datast 
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)


# 000000000000000000000000000000000000000000000
#
#      1. G2P simulation & GWAS by Corr ----
#
# 000000000000000000000000000000000000000000000

# Before we can identify any false positive and perform any power 
# analysis, we will first simulate a phenometric vector and perform a
# GWAS by Correlation analysis, so that we can have something to 
# analysis.

# 1.1 G2P phenotype simulation ------------------------------------
X=myGD[,-1]
index1to5=myGM[,2]<6
X1to5 = X[,index1to5]
set.seed(99164)
mySim=G2P(X= X1to5,h2=.75,alpha=1,NQTN=10,distribution="norm")
# notice that we only simulated with QNT's on 1st-5th chromosome

# 1.2 GWAS by Corr ------------------------------------
p= GWASbyCor(X=X,y=mySim$y) # ignore the warning
round(p[1:5], 4) 
# ?round()


# 000000000000000000000000000000000000000000000
#
# 2. True-positive & false-positive ----
#       (T-pos)          (F-pos)
#
# 000000000000000000000000000000000000000000000

index=order(p) 
# rank of each p value in ascending order. Please notice order is 
# different from sort()
# ?order()
# Example:  lala=c(4,2,6,3,7); order(lala)

top5=index[1:5] # location of the five most significant
top5
p[top5] # p-value of the 5 most significant QTN's

# 2.1 find T-pos and F-pos ------------------------------------
# when simulate the phenotype, we know we only assigned 10 QTN's to
# contribute to our phenotype (if you are not clear how this was done,
# please review lecture/lab for phenotype simulation). We know where
# those QTN are (in mySim$QTN.position)
# Thus, if the significant QNT's are the ones we assign, it is true-positive.
# Otherwise, the significant ones are false-positive
detected=intersect(top5,mySim$QTN.position)
# ?intersect()
detected
# Is detected true-positive or false-positive

falsePositive=setdiff(top5, mySim$QTN.position)
falsePositive

# plot the two situation:
color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=1, col = "black")
abline(v= falsePositive, lty = 2, lwd=2, col = "red")

# 2.2 p.null ------------------------------------
# some locations we recieved NA values for p-value. First, we want to know 
# how many NA p values we got
sum(is.na(p)) 
# recall: TRUE=1 and FALSE=0, and we use sum() to count the number of NA valus
# count the number of certain even is a common task in data analysis
# there're other ways to do so.

# Our null hypothesis - a QTN is not correlated with the phenotype.
index.null=!index1to5 & !is.na(p)
# index.null = on chromosome 1-6 and the p value is not NA
p.null=p[index.null]
# get p values for the index.null condition
m.null=length(p.null)

index.sort=order(p.null) 
# record the rank of p.null
p.null.sort=p.null[index.sort]
# note: the above two command is equivelent to
# p.null.sort = sort(p.null)

head(p.null.sort) # smallest p-values
tail(p.null.sort) # lagest p-values
seq=seq(1:m.null) 
table=data.frame(seq, p.null.sort, seq/m.null) 
# third parameter (seq/m.null) is a ratio, similar to the idea of percentile
# equivelent to (current rank)/(total length of array)
# put the rank, p.null.sort, and seq.m.null into a dataframe table.
head(table,5)
tail(table,5)
plot(table$seq, table$p.null.sort, xlab = "Rank", ylab = "p-value")
# the lower the rank is, the smaller the p-value is

# 000000000000000000000000000000000000000000000
#
#          3. GAPIT for FDR and Power ----
#
# 000000000000000000000000000000000000000000000

# 3.1 QTN Bins ------------------------------------
# First we organize the result above and prepare it in the format for GAPIT function

# we will code the location of QTN into bins
bigNum=1e9;  resolution=1e5
bin=round((myGM$Chromosome*bigNum+myGM$Position)/resolution)
# QTN's close to each other will be put in the same bin
result=cbind(myGM,t(p),bin)
head(result) 
# notice the second to fourth QNT's are all in bin 10029
# because they are close 

# Get the bin information for the QTN's we used for phenotype
# simulation
QTN.bin=result[mySim$QTN.position,]
QTN.bin

# sort QTN.bin dataset by t(p) value
index.qtn.p=order(QTN.bin$`t(p)` ) 
# notive the 't(p)' colun is quoted in the `` sign (the one on the same key as the ~ sign)
QTN.bin[index.qtn.p,]

# 3.2 GAPIT FDR TypeI ------------------------------------
# We will use GAPIT.FDR.TypeI function to do the task.
myStat=GAPIT.FDR.TypeI(
  WS=c(1e0,1e3,1e4,1e5), # WS = window size
  GM=myGM,  # dataset
  seqQTN=mySim$QTN.position,  # QTN position
  GWAS=result # result[SNPm Chromosome, Position, t(p), bin]. See Sec.3 QTN Bins
  )
str(myStat)
myStat$FDR 
# four columns corresponding to four wondow sizes
# ten rows corresponding to ten QTN positions

par(mfrow=c(1,2),mar = c(5,4,3,2))
plot(myStat$FDR[,1],myStat$Power,type="b", xlab="FDR", ylab="Power")
plot(myStat$TypeI[,1],myStat$Power,type="b", xlab="TypeI error", ylab="Power")

# 000000000000000000000000000000000000000000000
#
#          4. Repeat GAPIT.FDR.TYPEI 1. ----
#
# 000000000000000000000000000000000000000000000

# Now, we will repeat the process 100 times
# In each loop, we start from G2P simulation, and ends at calculating the TypeI error
nrep=100
set.seed(99164)
statRep=replicate(nrep, {
  mySim=G2P(X=myGD[,-1],h2=.75,alpha=1,NQTN=10,distribution="norm")
  p=p= GWASbyCor(X=myGD[,-1],y=mySim$y)
  seqQTN=mySim$QTN.position
  myGWAS=cbind(myGM,t(p),NA)
  myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), 
                         GM=myGM,seqQTN=mySim$QTN.position,
                         GWAS=myGWAS,maxOut=100,MaxBP=1e10)
})
# ignore the error

str(statRep) 
# Notice: we asked to save everything in the statRep, thus statRep is a very big list. 
# But it is still a list, we can use index to get the result we are interested in.
power=statRep[[2]]# the second list member
power

# 4.1 Results as number scores -----------------------------------
s.fdr=seq(3,length(statRep),7) 
# making a number sequence starting from 3 to the number of the length of statRep, with an 
# interval of 7. The s.fdr is a index
fdr=statRep[s.fdr]
# The result is 100 fdr sub-lists. Each sub-list corresponding to one replicate in the
# statRep loop (beginning of sec.4)
# Observe the structure the Sec.3.2 myStat, make sure you understand how we obtain all fdr
fdr.mean=Reduce ("+", fdr) / length(fdr)
# Use 'Reduce()' function to perform a element wise summation
# ?Reduce
# Example:
#    add <- function(x) Reduce("+", x)
#    add(list(1, 2, 3))

# similarly, get typeI error rate
s.tp1=seq(4,length(statRep),7)
tp1=statRep[s.tp1]
tp1.mean=Reduce ("+", tp1) / length(tp1)

# AUC (area under curve)
s.auc.fdr=seq(6,length(statRep),7)
auc.fdr=statRep[s.auc.fdr]
auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr)


# 4.2 Results as plots ------------------------------ 
# ROC curve (Receiver Operating Characteristic)
theColor=rainbow(4)
plot(fdr.mean[,1],power , type="b", col=theColor [1],xlim=c(0,1))
for(i in 2:ncol(fdr.mean)){
  lines(fdr.mean[,i], power , type="b", col= theColor [i])
}

# barplot on AUC
barplot(auc.fdr.mean, 
        names.arg=c("1bp", "1K", "10K","100K"), 
        xlab="Resolution",
        ylab="AUC")


# 000000000000000000000000000000000000000000000
#
#    5. Repeat with varied Heritability ----
#
# 000000000000000000000000000000000000000000000

nrep=100
set.seed(99164)

#h2=25%
statRep25=replicate(nrep, {
  mySim=G2P(X=myGD[,-1],h2=.25,alpha=1,NQTN=10,distribution="norm")
  p=p= GWASbyCor(X=myGD[,-1],y=mySim$y)
  seqQTN=mySim$QTN.position
  myGWAS=cbind(myGM,t(p),NA)
  myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)})

#h2=75%
statRep75=replicate(nrep, {
  mySim=G2P(X=myGD[,-1],h2=.75,alpha=1,NQTN=10,distribution="norm")
  p=p= GWASbyCor(X=myGD[,-1],y=mySim$y)
  seqQTN=mySim$QTN.position
  myGWAS=cbind(myGM,t(p),NA)
  myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)})

power25=statRep25[[2]]
s.t1=seq(4,length(statRep25),7)
t1=statRep25[s.t1]
t1.mean.25=Reduce ("+", t1) / length(t1)

power75=statRep75[[2]]
s.t1=seq(4,length(statRep75),7)
t1=statRep75[s.t1]
t1.mean.75=Reduce ("+", t1) / length(t1)

plot(t1.mean.25[,4],power25, type="b", col="blue",xlim=c(0,1))
lines(t1.mean.75[,4], power75, type="b", col= "red")