# Load data
myGD = read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt", head = T)
myGM = read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", head = T)
source("http://zzlab.net/StaGen/2020/R/G2P.R")
source("http://zzlab.net/StaGen/2020/R/GWASbyCor.R")
# Subset the genetic data (myGD) by removing the first column
X = myGD[, -1]
# Create a logical index based on the chromosome number
index1to5 = myGM[,2] < 6
# This is used to select first half genome (chr 1 to 5) from X
X1to5 = X[,index1to5]
# Set the random seed for reproducibility, so the results are the same each time
set.seed(99164)
# Simulate phenotype using the G2P function:
mySim = G2P(X = X1to5, h2 = .75, alpha = 1, NQTN = 10, distribution = "norm")
# GWAS by correlation
p = GWASbyCor(X = X, y = mySim$y)
# Manhattan plot
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
# number of SNPs
m = nrow(myGM)
# Plot -log10(p) against SNPs
plot(t(-log10(p)) ~ seq(1:m), col = color.vector[myGM[, 2]]) # Color the points based on chromosome number
# True QTN position
abline(v = mySim$QTN.position, lty = 2, lwd = 2, col = "gray")
abline(h = -log10(0.01), lty = 1, lwd = 2, col = "red")
# Bonferroni correction
P = 0.01 / 3093
-log10(P)
## [1] 5.49038
# Add a horizontal line at
abline(h = -log10(P), lty = 1, lwd = 2, col = "red")
p.obs=p
m=length(p.obs)
hist(p.obs)
p.uni=runif(length(p.obs),0,1)
hist(p.uni)
p.exp=seq(m)/m
hist(p.exp)
order.uni=order(p.uni)
ps.uni=p.uni[order.uni]
plot(p.exp,ps.uni)
plot(-log10(p.exp),-log10(ps.uni))
order.obs=order(p.obs)
ps.obs=p.obs[order.obs]
plot(-log10(p.exp),-log10(ps.obs))
abline(a = 0, b = 1, col = "red")
p.obs=p[-mySim$QTN.positio]
m=length(p.obs)
p.exp=seq(m)/m
order.obs=order(p.obs)
ps.obs=p.obs[order.obs]
plot(-log10(p.exp),-log10(ps.obs))
abline(a = 0, b = 1, col = "red")
p.obs=p[!index1to5]
m=length(p.obs)
p.exp=seq(m)/m
order.obs=order(p.obs)
ps.obs=p.obs[order.obs]
plot(-log10(p.exp),-log10(ps.obs))
abline(a = 0, b = 1, col = "red")
type1=c(0.01, 0.05, 0.1, 0.2)
cutoff=quantile(p.obs,type1,na.rm=T)
cutoff
## 1% 5% 10% 20%
## 0.00212527 0.01637685 0.05601180 0.13322666
plot(type1, cutoff,type="b")
-log10(cutoff[1]) # Single test threshold
## 1%
## 2.672586
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=-log10(cutoff[1]) , lty = 2, lwd=2, col = "red")
Single test threshold: 𝛼=0.01 Number of test: m=3093 Threshold: 𝛼/𝑚= 3.23311E-06, or -log10(3.23311E-06)=5.49
# Bonferroni Multiple test threshold
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=-log10(0.01/m), lty = 2, lwd=2, col = "red")
Single test threshold: 𝛼=“0.00212527” Number of test: m=3093 Threshold: 𝛼/𝑚=6.871225E-07, or -log10(6.871225E-07)=6.16
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=-log10(cutoff[1]/m), lty = 2, lwd=2, col = "red")
# Bonferroni multiple test threshold with inflation correction
Bonferroni threshold leads to over correction if multiple tests are not independent, e.g. dense markers or long LD extent.
Empirical threshold
1. Randomly shuffle observed phenotypes to break the linkage with
genotypes 2. Conduct GWAS on the shuffled phenotypes and record the most
significant P value 3. Repeat 1-2 n (e.g. 1,000) times to get the
distribution of the minimum P values 4. Use the 1% percentile as the
threshold
set.seed(99164)
nRep=1000
P.array=matrix(NA, 1,nRep)
n=length(mySim$y)
for(i in c(1:nRep)){
theY=mySim$y[sample(n,n)]
p= GWASbyCor(X=X,y=theY)
theMinP=min(p,na.rm=T)
P.array[i]= theMinP
}
plot(density(-log10(P.array)))
plot(ecdf(-log10(P.array)))
th <- quantile(-log10(P.array),.99)
th
## 99%
## 5.357958
set.seed(99164)
# Simulate phenotype using the G2P function:
mySim = G2P(X = X1to5, h2 = .75, alpha = 1, NQTN = 10, distribution = "norm")
# GWAS by correlation
p = GWASbyCor(X = X, y = mySim$y)
# Empirical multiple test threshold, No inflation correction
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=th, lty = 2, lwd=2, col = "red")
# Empirical multiple test threshold with inflation correction
th <- quantile(-log10(P.array),1-cutoff[1])
th
## 99.78747%
## 5.995218
color.vector <- rep(c('#EC5f67', '#FAC863', '#99C794', '#6699CC', '#C594C5'),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(h=th, lty = 2, lwd=2, col = "red")
bigNum=1e9
resolution=100000
bin=round((myGM[,2]*bigNum+myGM[,3])/resolution)
result=cbind(myGM,t(p),bin)
head(result)
## SNP Chromosome Position t(p) bin
## PZB00859.1 PZB00859.1 1 157104 0.93326725 10002
## PZA01271.1 PZA01271.1 1 1947984 0.72817662 10019
## PZA03613.2 PZA03613.2 1 2914066 0.81923517 10029
## PZA03613.1 PZA03613.1 1 2914171 0.56670871 10029
## PZA03614.2 PZA03614.2 1 2915078 0.07868706 10029
## PZA03614.1 PZA03614.1 1 2915242 0.14958491 10029
tail(result)
## SNP Chromosome Position t(p) bin
## PZA00678.2 PZA00678.2 10 147762964 0.44774328 101478
## PZA00179.9 PZA00179.9 10 147819958 0.49223448 101478
## PZA02833.6 PZA02833.6 10 148332885 0.57258347 101483
## PHM1506.23 PHM1506.23 10 148488692 0.09820785 101485
## PZA03402.1 PZA03402.1 10 148816538 0.20554130 101488
## PZA03429.1 PZA03429.1 10 148907116 0.18355498 101489
# Bins of QTNs
QTN.bin=result[mySim$QTN.position,]
QTN.bin
## SNP Chromosome Position t(p) bin
## PZA03457.1 PZA03457.1 1 262715518 1.898106e-04 12627
## PZB01233.6 PZB01233.6 2 3376157 5.961492e-02 20034
## PZA00610.15 PZA00610.15 1 280215046 1.985976e-01 12802
## PZA00527.10 PZA00527.10 2 216833071 2.569003e-10 22168
## PZA00709.19 PZA00709.19 1 261859081 1.459755e-05 12619
## PZA00367.6 PZA00367.6 2 161879515 1.052209e-03 21619
## PZA03647.1 PZA03647.1 3 185318330 2.324493e-09 31853
## PZA02742.1 PZA02742.1 3 97441783 2.012597e-08 30974
## PZA03121.2 PZA03121.2 2 29977973 3.009048e-05 20300
## PZA01902.1 PZA01902.1 2 89520665 8.091195e-07 20895
# Sort bins of QTNs
index.qtn.p=order(QTN.bin[,4])
QTN.bin[index.qtn.p,]
## SNP Chromosome Position t(p) bin
## PZA00527.10 PZA00527.10 2 216833071 2.569003e-10 22168
## PZA03647.1 PZA03647.1 3 185318330 2.324493e-09 31853
## PZA02742.1 PZA02742.1 3 97441783 2.012597e-08 30974
## PZA01902.1 PZA01902.1 2 89520665 8.091195e-07 20895
## PZA00709.19 PZA00709.19 1 261859081 1.459755e-05 12619
## PZA03121.2 PZA03121.2 2 29977973 3.009048e-05 20300
## PZA03457.1 PZA03457.1 1 262715518 1.898106e-04 12627
## PZA00367.6 PZA00367.6 2 161879515 1.052209e-03 21619
## PZB01233.6 PZB01233.6 2 3376157 5.961492e-02 20034
## PZA00610.15 PZA00610.15 1 280215046 1.985976e-01 12802
# Load data
myGD = read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt", head = T)
myGM = read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", head = T)
source("http://zzlab.net/StaGen/2020/R/G2P.R")
source("http://zzlab.net/StaGen/2020/R/GWASbyCor.R")
set.seed(99163)
mySim = G2P(X = myGD[, -1], h2 = .75, alpha = 1, NQTN = 10, distribution = "normal")
p = GWASbyCor(X = X, y = mySim$y)
# Manhattan plot
color.vector <- rep(c("deepskyblue", "orange", "forestgreen", "indianred3"), 10)
# number of SNPs
m = nrow(myGM)
# Plot -log10(p) against SNPs
plot(t(-log10(p)) ~ seq(1:m), col = color.vector[myGM[, 2]]) # Color the points based on chromosome number
# True QTN position
abline(v = mySim$QTN.position, lty = 2, lwd = 2, col = "gray")
source("http://zzlab.net/GAPIT/gapit_functions.txt")
set.seed(99164)
myStat=GAPIT.FDR.TypeI(
# WS- window size
#WS=c(1e0,1e3,1e4,1e5),
WS=1e5,
#Input: GM - m by 3 matrix for SNP name, chromosome and BP
GM=myGM,
#Input: seqQTN - s by 1 vecter for index of QTN on GM (+1 for GDP column wise)
seqQTN=mySim$QTN.position,
#Input: GWAS - SNP,CHR,BP,P,MAF
GWAS=result)
str(myStat)
## List of 7
## $ P : num [1, 1:10] 0.0172 0.0243 0.0281 0.0424 0.0456 ...
## $ Power : num [1:10] 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
## $ FDR : num [1:10, 1] 0.994 0.99 0.986 0.985 0.982 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:10] "fdr.record" "final.fdr" "final.fdr" "final.fdr" ...
## .. ..$ : NULL
## $ TypeI : num [1:10, 1] 0.123 0.145 0.156 0.193 0.201 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:10] "t1.record" "final.t1" "final.t1" "final.t1" ...
## .. ..$ : NULL
## $ False : int [1:10, 1] 168 198 213 264 274 495 580 987 1052 1171
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:10] "number.record" "final" "final" "final" ...
## .. ..$ : NULL
## $ AUC.FDR: num 0.0104
## $ AUC.T1 : num 0.553
WS=1e5
GWAS=result
seqQTN=mySim$QTN.position
MaxBP=1e10
total.index=1:nrow(myGM)
#result = GWAS
qtn.pool=ceiling((as.numeric(GWAS[seqQTN,2])*MaxBP+as.numeric(GWAS[seqQTN,3]))/(2*WS))
bonf.pool=ceiling((as.numeric(GWAS[total.index,2])*MaxBP+as.numeric(GWAS[total.index,3]))/(2*WS))
false.number=length(levels(factor(bonf.pool[!(bonf.pool%in%qtn.pool)])))
# Create a proper data frame
stat <- data.frame(
bin = qtn.pool,
P = t(myStat$P)[, 1],
Power = myStat$Power,
False = myStat$False[, 1], # Extract the first column from False
FDR = myStat$FDR[, 1], # Extract the first column from FDR
TypeI = myStat$TypeI[, 1] # Extract the first column from TypeI
)
# Check the result
print(stat)
## bin P Power False FDR TypeI
## 1 50223 0.01720321 0.1 168 0.9940828 0.1230769
## 2 200093 0.02434129 0.2 198 0.9900000 0.1450549
## 3 350376 0.02811636 0.3 213 0.9861111 0.1560440
## 4 200367 0.04235257 0.4 264 0.9850746 0.1934066
## 5 350072 0.04557691 0.5 274 0.9820789 0.2007326
## 6 101145 0.13070356 0.6 495 0.9880240 0.3626374
## 7 400818 0.17537951 0.7 580 0.9880750 0.4249084
## 8 250967 0.50552849 0.8 987 0.9919598 0.7230769
## 9 200186 0.57165315 0.9 1052 0.9915174 0.7706960
## 10 150780 0.73300958 1.0 1171 0.9915326 0.8578755
# Area Under Curve (AUC)
par(mfrow=c(1,2),mar = c(5,5,5,2))
plot(myStat$FDR[,1],myStat$Power,type="b", xlab = "FDR", ylab = "Power" )
plot(myStat$TypeI[,1],myStat$Power,type="b", xlab = "Type I error", ylab="Power")
# Replicate 100 times and plot the means
nrep=100
set.seed(99164)
statRep=replicate(nrep, {
mySim=G2P(X=myGD[,-1],h2=.5,alpha=1,NQTN=10,distribution="norm")
p=GWASbyCor(X=myGD[,-1],y=mySim$y)
seqQTN=mySim$QTN.position
myGWAS=cbind(myGM,t(p),NA)
myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)
})
# str(statRep)
# Power
power=statRep[[2]]
# FDR
s.fdr=seq(3,length(statRep),7)
fdr=statRep[s.fdr]
fdr.mean=Reduce ("+", fdr) / length(fdr) #fdr is a list of vectors, Reduce("+", fdr) will sum the vectors element-wise.
#AUC
s.auc.fdr=seq(6,length(statRep),7)
auc.fdr=statRep[s.auc.fdr]
auc.fdr.mean=Reduce ("+", auc.fdr) / length(auc.fdr)
# Plots of power vs FDR
theColor=rainbow(4)
# theColor=c("red","blue","green")
plot(fdr.mean[,1],power , type="b", col=theColor [1],xlim=c(0,1))
for(i in 2:ncol(fdr.mean)){
lines(fdr.mean[,i], power , type="b", col= theColor [i])
}
# Plots of AUC
barplot(auc.fdr.mean, names.arg=c("1bp", "1K", "10K","100K"), xlab="Resolution", ylab="AUC")
h2= 25% vs. 75% 10 QTNs Normal distributed QTN effect 100kb resolution Power against Type I error
nrep=100
set.seed(99164)
#h2=25%
statRep25=replicate(nrep, {
mySim=G2P(X=myGD[,-1],h2=.25,alpha=1,NQTN=10,distribution="norm")
p=GWASbyCor(X=myGD[,-1],y=mySim$y)
seqQTN=mySim$QTN.position
myGWAS=cbind(myGM,t(p),NA)
myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)})
#h2=75%
statRep75=replicate(nrep, {
mySim=G2P(X=myGD[,-1],h2=.75,alpha=1,NQTN=10,distribution="norm")
p=GWASbyCor(X=myGD[,-1],y=mySim$y)
seqQTN=mySim$QTN.position
myGWAS=cbind(myGM,t(p),NA)
myStat=GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM=myGM,seqQTN=mySim$QTN.position,GWAS=myGWAS,maxOut=100,MaxBP=1e10)})
power25=statRep25[[2]]
s.t1=seq(4,length(statRep25),7)
t1=statRep25[s.t1]
t1.mean.25=Reduce ("+", t1) / length(t1)
power75=statRep75[[2]]
s.t1=seq(4,length(statRep75),7)
t1=statRep75[s.t1]
t1.mean.75=Reduce ("+", t1) / length(t1)
plot(t1.mean.25[,4],power25, type="b", col="blue",xlim=c(0,0.6))
lines(t1.mean.75[,4], power75, type="b", col= "red")