QTNs 0n CHR 1-5, leave 6-10 empty

# Load data
myGD = read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt", head = T)
myGM = read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", head = T)
source("http://zzlab.net/StaGen/2020/R/G2P.R")
source("http://zzlab.net/StaGen/2020/R/GWASbyCor.R")
# Subset the genetic data (myGD) by removing the first column
X = myGD[, -1]
# Create a logical index based on the chromosome number
index1to5 = myGM[,2] < 6
# This is used to select first half genome (chr 1 to 5) from X
X1to5 = X[,index1to5]
# Set the random seed for reproducibility, so the results are the same each time
set.seed(99164)
# Simulate phenotype using the G2P function:
mySim = G2P(X = X1to5, h2 = .75, alpha = 1, NQTN = 10, distribution = "norm")
# GWAS by correlation
p = GWASbyCor(X = X, y = mySim$y) 

# Manhattan plot
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
# number of SNPs
m = nrow(myGM) 
# Plot -log10(p) against SNPs
plot(t(-log10(p)) ~ seq(1:m), col = color.vector[myGM[, 2]]) # Color the points based on chromosome number

# True QTN position
abline(v = mySim$QTN.position, lty = 2, lwd = 2, col = "gray")

abline(h = -log10(0.01), lty = 1, lwd = 2, col = "red")

# Bonferroni correction
P = 0.01 / 3093
-log10(P)
## [1] 5.49038
# Add a horizontal line at 
abline(h = -log10(P), lty = 1, lwd = 2, col = "red")

P value observed and expected (null)

p.obs=p
m=length(p.obs)
hist(p.obs)

p.uni=runif(length(p.obs),0,1)
hist(p.uni)

p.exp=seq(m)/m
hist(p.exp)

Comparison between the two ways of expected (null)

order.uni=order(p.uni)
ps.uni=p.uni[order.uni]

plot(p.exp,ps.uni)

plot(-log10(p.exp),-log10(ps.uni))

Observed against expected: QQ plot

order.obs=order(p.obs)
ps.obs=p.obs[order.obs]
plot(-log10(p.exp),-log10(ps.obs))
abline(a = 0, b = 1, col = "red")

QQ plot on Non-QTN SNPs

p.obs=p[-mySim$QTN.positio]
m=length(p.obs)
p.exp=seq(m)/m

order.obs=order(p.obs)
ps.obs=p.obs[order.obs]
plot(-log10(p.exp),-log10(ps.obs)) 
abline(a = 0, b = 1, col = "red")

QQ plot on null SNPs

p.obs=p[!index1to5]
m=length(p.obs)
p.exp=seq(m)/m

order.obs=order(p.obs)
ps.obs=p.obs[order.obs]
plot(-log10(p.exp),-log10(ps.obs)) 
abline(a = 0, b = 1, col = "red")

Cutoff (Exact))

type1=c(0.01, 0.05, 0.1, 0.2)
cutoff=quantile(p.obs,type1,na.rm=T)
cutoff
##         1%         5%        10%        20% 
## 0.00212527 0.01637685 0.05601180 0.13322666
plot(type1, cutoff,type="b")

-log10(cutoff[1])  # Single test threshold
##       1% 
## 2.672586
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=-log10(cutoff[1]) , lty = 2, lwd=2, col = "red")

Multiple test correction: Bonferroni

Single test threshold: 𝛼=0.01 Number of test: m=3093 Threshold: 𝛼/𝑚= 3.23311E-06, or -log10(3.23311E-06)=5.49

# Bonferroni Multiple test threshold
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=-log10(0.01/m), lty = 2, lwd=2, col = "red")

Bonferroni threshold with inflation correction

Single test threshold: 𝛼=“0.00212527” Number of test: m=3093 Threshold: 𝛼/𝑚=6.871225E-07, or -log10(6.871225E-07)=6.16

color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=-log10(cutoff[1]/m), lty = 2, lwd=2, col = "red")

# Bonferroni multiple test threshold with inflation correction

Problems of Bonferroni Threshold

Bonferroni threshold leads to over correction if multiple tests are not independent, e.g. dense markers or long LD extent.

Empirical threshold
1. Randomly shuffle observed phenotypes to break the linkage with genotypes 2. Conduct GWAS on the shuffled phenotypes and record the most significant P value 3. Repeat 1-2 n (e.g. 1,000) times to get the distribution of the minimum P values 4. Use the 1% percentile as the threshold

set.seed(99164)
nRep=1000
P.array=matrix(NA, 1,nRep)
n=length(mySim$y)

for(i in c(1:nRep)){
theY=mySim$y[sample(n,n)]
p= GWASbyCor(X=X,y=theY)
theMinP=min(p,na.rm=T)
P.array[i]= theMinP
}
plot(density(-log10(P.array))) 

plot(ecdf(-log10(P.array)))

th <- quantile(-log10(P.array),.99)
th
##      99% 
## 5.357958
set.seed(99164)
# Simulate phenotype using the G2P function:
mySim = G2P(X = X1to5, h2 = .75, alpha = 1, NQTN = 10, distribution = "norm")
# GWAS by correlation
p = GWASbyCor(X = X, y = mySim$y) 
# Empirical multiple test threshold, No inflation correction
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=th, lty = 2, lwd=2, col = "red")

# Empirical multiple test threshold with inflation correction
th <- quantile(-log10(P.array),1-cutoff[1])
th
## 99.78747% 
##  5.995218
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=th, lty = 2, lwd=2, col = "red")

Resolution and bin approach

bigNum=1e9
resolution=100000
bin=round((myGM[,2]*bigNum+myGM[,3])/resolution)
result=cbind(myGM,t(p),bin)
head(result)
##                   SNP Chromosome Position       t(p)   bin
## PZB00859.1 PZB00859.1          1   157104 0.93326725 10002
## PZA01271.1 PZA01271.1          1  1947984 0.72817662 10019
## PZA03613.2 PZA03613.2          1  2914066 0.81923517 10029
## PZA03613.1 PZA03613.1          1  2914171 0.56670871 10029
## PZA03614.2 PZA03614.2          1  2915078 0.07868706 10029
## PZA03614.1 PZA03614.1          1  2915242 0.14958491 10029
tail(result)
##                   SNP Chromosome  Position       t(p)    bin
## PZA00678.2 PZA00678.2         10 147762964 0.44774328 101478
## PZA00179.9 PZA00179.9         10 147819958 0.49223448 101478
## PZA02833.6 PZA02833.6         10 148332885 0.57258347 101483
## PHM1506.23 PHM1506.23         10 148488692 0.09820785 101485
## PZA03402.1 PZA03402.1         10 148816538 0.20554130 101488
## PZA03429.1 PZA03429.1         10 148907116 0.18355498 101489
# Bins of QTNs
QTN.bin=result[mySim$QTN.position,]
QTN.bin
##                     SNP Chromosome  Position         t(p)   bin
## PZA03457.1   PZA03457.1          1 262715518 1.898106e-04 12627
## PZB01233.6   PZB01233.6          2   3376157 5.961492e-02 20034
## PZA00610.15 PZA00610.15          1 280215046 1.985976e-01 12802
## PZA00527.10 PZA00527.10          2 216833071 2.569003e-10 22168
## PZA00709.19 PZA00709.19          1 261859081 1.459755e-05 12619
## PZA00367.6   PZA00367.6          2 161879515 1.052209e-03 21619
## PZA03647.1   PZA03647.1          3 185318330 2.324493e-09 31853
## PZA02742.1   PZA02742.1          3  97441783 2.012597e-08 30974
## PZA03121.2   PZA03121.2          2  29977973 3.009048e-05 20300
## PZA01902.1   PZA01902.1          2  89520665 8.091195e-07 20895
# Sort bins of QTNs
index.qtn.p=order(QTN.bin[,4])
QTN.bin[index.qtn.p,]
##                     SNP Chromosome  Position         t(p)   bin
## PZA00527.10 PZA00527.10          2 216833071 2.569003e-10 22168
## PZA03647.1   PZA03647.1          3 185318330 2.324493e-09 31853
## PZA02742.1   PZA02742.1          3  97441783 2.012597e-08 30974
## PZA01902.1   PZA01902.1          2  89520665 8.091195e-07 20895
## PZA00709.19 PZA00709.19          1 261859081 1.459755e-05 12619
## PZA03121.2   PZA03121.2          2  29977973 3.009048e-05 20300
## PZA03457.1   PZA03457.1          1 262715518 1.898106e-04 12627
## PZA00367.6   PZA00367.6          2 161879515 1.052209e-03 21619
## PZB01233.6   PZB01233.6          2   3376157 5.961492e-02 20034
## PZA00610.15 PZA00610.15          1 280215046 1.985976e-01 12802

FDR and type I error

# Load data
myGD = read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt", head = T)
myGM = read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", head = T)
source("http://zzlab.net/StaGen/2020/R/G2P.R")
source("http://zzlab.net/StaGen/2020/R/GWASbyCor.R")
set.seed(99163)
mySim = G2P(X = myGD[, -1], h2 = .75, alpha = 1, NQTN = 10, distribution = "normal")
p = GWASbyCor(X = X, y = mySim$y) 

# Manhattan plot
color.vector <- rep(c("deepskyblue", "orange", "forestgreen", "indianred3"), 10)
# number of SNPs
m = nrow(myGM) 
# Plot -log10(p) against SNPs
plot(t(-log10(p)) ~ seq(1:m), col = color.vector[myGM[, 2]]) # Color the points based on chromosome number
# True QTN position
abline(v = mySim$QTN.position, lty = 2, lwd = 2, col = "gray")

FDR and type I error

source("http://zzlab.net/GAPIT/gapit_functions.txt")
set.seed(99164)
myStat=GAPIT.FDR.TypeI(
# WS- window size
#WS=c(1e0,1e3,1e4,1e5), 
WS=1e5, 
#Input: GM - m by 3  matrix for SNP name, chromosome and BP
GM=myGM,
#Input: seqQTN - s by 1 vecter for index of QTN on GM (+1 for GDP column wise)
seqQTN=mySim$QTN.position,
#Input: GWAS - SNP,CHR,BP,P,MAF
GWAS=result)

str(myStat)
## List of 7
##  $ P      : num [1, 1:10] 0.0172 0.0243 0.0281 0.0424 0.0456 ...
##  $ Power  : num [1:10] 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
##  $ FDR    : num [1:10, 1] 0.994 0.99 0.986 0.985 0.982 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:10] "fdr.record" "final.fdr" "final.fdr" "final.fdr" ...
##   .. ..$ : NULL
##  $ TypeI  : num [1:10, 1] 0.123 0.145 0.156 0.193 0.201 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:10] "t1.record" "final.t1" "final.t1" "final.t1" ...
##   .. ..$ : NULL
##  $ False  : int [1:10, 1] 168 198 213 264 274 495 580 987 1052 1171
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:10] "number.record" "final" "final" "final" ...
##   .. ..$ : NULL
##  $ AUC.FDR: num 0.0104
##  $ AUC.T1 : num 0.553
WS=1e5
GWAS=result
seqQTN=mySim$QTN.position
MaxBP=1e10
total.index=1:nrow(myGM)
#result = GWAS
qtn.pool=ceiling((as.numeric(GWAS[seqQTN,2])*MaxBP+as.numeric(GWAS[seqQTN,3]))/(2*WS))
bonf.pool=ceiling((as.numeric(GWAS[total.index,2])*MaxBP+as.numeric(GWAS[total.index,3]))/(2*WS))
false.number=length(levels(factor(bonf.pool[!(bonf.pool%in%qtn.pool)])))

# Create a proper data frame
stat <- data.frame(
     bin = qtn.pool,
     P = t(myStat$P)[, 1],  
     Power = myStat$Power,
     False = myStat$False[, 1],  # Extract the first column from False
     FDR = myStat$FDR[, 1],  # Extract the first column from FDR
     TypeI = myStat$TypeI[, 1]  # Extract the first column from TypeI
 )

# Check the result
print(stat)
##       bin          P Power False       FDR     TypeI
## 1   50223 0.01720321   0.1   168 0.9940828 0.1230769
## 2  200093 0.02434129   0.2   198 0.9900000 0.1450549
## 3  350376 0.02811636   0.3   213 0.9861111 0.1560440
## 4  200367 0.04235257   0.4   264 0.9850746 0.1934066
## 5  350072 0.04557691   0.5   274 0.9820789 0.2007326
## 6  101145 0.13070356   0.6   495 0.9880240 0.3626374
## 7  400818 0.17537951   0.7   580 0.9880750 0.4249084
## 8  250967 0.50552849   0.8   987 0.9919598 0.7230769
## 9  200186 0.57165315   0.9  1052 0.9915174 0.7706960
## 10 150780 0.73300958   1.0  1171 0.9915326 0.8578755
# Area Under Curve (AUC)
par(mfrow=c(1,2),mar = c(5,5,5,2))
plot(myStat$FDR[,1],myStat$Power,type="b",  xlab = "FDR", ylab = "Power" )
plot(myStat$TypeI[,1],myStat$Power,type="b", xlab = "Type I error", ylab="Power")

Replicates: Average power vs Average FDR and TypeI error

# Replicate 100 times and plot the means
nrep=100
set.seed(99164)
statRep=replicate(nrep, {
mySim=G2P(X=myGD[,-1],h2=.5,alpha=1,NQTN=10,distribution="norm")
p=GWASbyCor(X=myGD[,-1],y=mySim$y)
seqQTN=mySim$QTN.position
myGWAS=cbind(myGM,t(p),NA)
myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)
})
# str(statRep)

# Power
power=statRep[[2]]
# FDR
s.fdr=seq(3,length(statRep),7)
fdr=statRep[s.fdr]
fdr.mean=Reduce ("+", fdr) / length(fdr) #fdr is a list of vectors, Reduce("+", fdr) will sum the vectors element-wise.

#AUC
s.auc.fdr=seq(6,length(statRep),7)
auc.fdr=statRep[s.auc.fdr]
auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr)

# Plots of power vs FDR
theColor=rainbow(4)
# theColor=c("red","blue","green")
plot(fdr.mean[,1],power , type="b", col=theColor [1],xlim=c(0,1))
for(i in 2:ncol(fdr.mean)){
  lines(fdr.mean[,i], power , type="b", col= theColor [i])
}

# Plots of AUC
barplot(auc.fdr.mean, names.arg=c("1bp", "1K", "10K","100K"), xlab="Resolution", ylab="AUC")

ROC with different heritability

h2= 25% vs. 75% 10 QTNs Normal distributed QTN effect 100kb resolution Power against Type I error

nrep=100
set.seed(99164)

#h2=25%
statRep25=replicate(nrep, {
mySim=G2P(X=myGD[,-1],h2=.25,alpha=1,NQTN=10,distribution="norm")
p=GWASbyCor(X=myGD[,-1],y=mySim$y)
seqQTN=mySim$QTN.position
myGWAS=cbind(myGM,t(p),NA)
myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)})

#h2=75%
statRep75=replicate(nrep, {
mySim=G2P(X=myGD[,-1],h2=.75,alpha=1,NQTN=10,distribution="norm")
p=GWASbyCor(X=myGD[,-1],y=mySim$y)
seqQTN=mySim$QTN.position
myGWAS=cbind(myGM,t(p),NA)
myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)})

power25=statRep25[[2]]
s.t1=seq(4,length(statRep25),7)
t1=statRep25[s.t1]
t1.mean.25=Reduce ("+", t1) / length(t1)

power75=statRep75[[2]]
s.t1=seq(4,length(statRep75),7)
t1=statRep75[s.t1]
t1.mean.75=Reduce ("+", t1) / length(t1)

plot(t1.mean.25[,4],power25, type="b", col="blue",xlim=c(0,0.6))
lines(t1.mean.75[,4], power75, type="b", col= "red")