# simulate the phenotype with G2P function
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
source("http://zzlab.net/StaGen/2020/R/G2P.R")
source("http://zzlab.net/StaGen/2020/R/GWASbyCor.R")
X=myGD[,-1]
index1to5=myGM[,2]<6
X1to5 = X[,index1to5]
set.seed(99164)
mySim=G2P(X= X1to5,h2=.75,alpha=1,NQTN=10,distribution="norm")
p= GWASbyCor(X=X,y=mySim$y)
# Manhattan plot
color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
# True QTN position
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "gray")
mySim$QTN.position
## [1] 382 563 448 902 380 790 1185 1025 674 743
# Top 10 SNPs
index=order(p)
top10=index[1:10]
top10
## [1] 902 1185 1025 743 157 380 1859 1866 674 158
abline(h=-log10(max(p[top10])), lty = 1, lwd=2, col = "black")
# Detected QTNs within top 10
detected=intersect(top10,mySim$QTN.position)
detected
## [1] 902 1185 1025 743 380 674
abline(v= detected, lty = 2, lwd=2, col = "blue")
# False positive within top 10
falsePositive=setdiff(top10, mySim$QTN.position)
falsePositive
## [1] 157 1859 1866 158
abline(v= falsePositive, lty = 2, lwd=2, col = "red")
# QQ plot
p.obs=p[!index1to5]
m2=length(p.obs)
p.uni=runif(m2,0,1)
order.obs=order(p.obs)
order.uni=order(p.uni)
plot(-log10(p.uni[order.uni]),-log10(p.obs[order.obs]))
abline(a = 0, b = 1, col = "red")
# Perform Principal Component Analysis (PCA) on the first 10 columns of the dataset X
pca=prcomp(X[,1:10])
str(pca)
## List of 5
## $ sdev : num [1:10] 1.394 1.114 0.922 0.828 0.793 ...
## $ rotation: num [1:10, 1:10] -0.0574 0.058 -0.5834 0.1601 0.5196 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:10] "PZB00859.1" "PZA01271.1" "PZA03613.2" "PZA03613.1" ...
## .. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:10] 1.52 1.02 1.42 1.5 1.06 ...
## ..- attr(*, "names")= chr [1:10] "PZB00859.1" "PZA01271.1" "PZA03613.2" "PZA03613.1" ...
## $ scale : logi FALSE
## $ x : num [1:281, 1:10] 1.45 2.05 1.98 1.78 2.05 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
# Create a heatmap of the scores (principal components for each observation)
# pca$x contains the scores (i.e., the transformed data in principal component space)
# Colv = NA and Rowv = NA prevent reordering of rows and columns (no hierarchical clustering)
heatmap(pca$x,Colv = NA,Rowv = NA)
# Create a heatmap of the loadings (coefficients of the principal components)
# pca$rotation contains eigenvectors
heatmap(pca$rotation,Colv = NA,Rowv = NA)
# the eigenvalues (Also known as Variance Explained in PCA) can be obtained as the squared values of the standard deviations stored in pca$sdev
# var(pca$x) calculates the variance-covariance matrix of the principal component scores. The variance-covariance matrix shows the variance along each principal component (diagonal elements) and the covariances between different principal components (off-diagonal elements).
# Both pca$sdev^2 and diag(var(pca$x)) are used to calculate the variance explained by each principal component in PCA, but they are derived in different ways.
plot(pca$sdev^2, diag(var(pca$x)))
PCA=prcomp(X)
str(PCA)
## List of 5
## $ sdev : num [1:281] 10.28 7.76 6.27 5.73 5.65 ...
## $ rotation: num [1:3093, 1:281] 0.00103 0.02962 0.00296 0.03495 -0.01134 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3093] "PZB00859.1" "PZA01271.1" "PZA03613.2" "PZA03613.1" ...
## .. ..$ : chr [1:281] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:3093] 1.52 1.02 1.42 1.5 1.06 ...
## ..- attr(*, "names")= chr [1:3093] "PZB00859.1" "PZA01271.1" "PZA03613.2" "PZA03613.1" ...
## $ scale : logi FALSE
## $ x : num [1:281, 1:281] 1.68 -1.6 -0.9 2.13 0.63 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:281] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
heatmap(PCA$x,Colv = NA,Rowv = NA)
heatmap(PCA$rotation,Colv = NA,Rowv = NA)
plot(PCA$sdev^2, diag(var(PCA$x)))
# Extraction
dim(PCA$sdev)
## NULL
str((PCA$sdev))
## num [1:281] 10.28 7.76 6.27 5.73 5.65 ...
dim(PCA$rotation)
## [1] 3093 281
dim(PCA$x)
## [1] 281 281
heatmap(PCA$x, Colv = NA)
PCA$x[1:10,1:5]
## PC1 PC2 PC3 PC4 PC5
## [1,] 1.6780775 -4.9373382 1.0029279 0.4804023 -0.2323756
## [2,] -1.6021749 -4.7322790 -0.6886187 -4.2666686 1.3539311
## [3,] -0.8999517 -6.2186090 2.2737655 -1.9902096 2.9406566
## [4,] 2.1334477 -6.2879301 5.6837161 -2.8050425 -15.9185028
## [5,] 0.6302372 -4.8947416 0.8189001 -2.1950980 -2.6457625
## [6,] -13.6690754 0.8736302 -21.6324613 1.6339498 -2.3971126
## [7,] -0.5680841 -5.9401064 0.3307201 -4.5042693 2.9859048
## [8,] 3.7670958 2.7504406 2.2384039 -1.6777158 -4.3525959
## [9,] 5.4327297 -0.1516547 1.0950912 -2.1140016 -0.3760369
## [10,] -0.9801181 -5.4792220 1.0430492 -5.0215953 4.4996895
# Contribution
pcavar =PCA$sdev^2
proportion= pcavar/sum(pcavar)
par(mfrow=c(1,4),mar = c(3,4,1,1))
plot(PCA$sdev)
plot(pcavar)
plot(ecdf(pcavar))
plot(proportion,type="b")
# Visualization
plot(PCA$x[,1],PCA$x[,2],col="red")
# Association with phenotypes
par(mfrow=c(2,1),mar = c(3,4,1,1))
plot(mySim$y,PCA$x[,1])
cor(mySim$y,PCA$x[,1])
## [,1]
## [1,] 0.09621831
plot(mySim$y,PCA$x[,2])
cor(mySim$y,PCA$x[,2])
## [,1]
## [1,] -0.08018205
pca1to5=prcomp(X[,index1to5])
pca6to10=prcomp(X[,!index1to5])
par(mfrow=c(2,2), mar = c(3,4,1,1))
plot(mySim$y, pca1to5$x[,1])
cor(mySim$y, pca1to5$x[,1])
## [,1]
## [1,] -0.110093
plot(mySim$y, pca6to10$x[,1])
cor(mySim$y, pca6to10$x[,1])
## [,1]
## [1,] -0.07469242
plot(mySim$y, pca1to5$x[,2])
cor(mySim$y, pca1to5$x[,2])
## [,1]
## [1,] 0.09782618
plot(mySim$y, pca6to10$x[,2])
cor(mySim$y, pca6to10$x[,2])
## [,1]
## [1,] -0.05796111
order.obs=order(p.obs)
X6to10=X[,!index1to5]
Xtop=X6to10[,order.obs[1]]
boxplot(mySim$y~Xtop)
PCA=prcomp(X)
plot(mySim$y,PCA$x[,2])
cor(mySim$y,PCA$x[,2])
## [,1]
## [1,] -0.08018205
set.seed(99164)
s=sample(length(mySim$y),10)
plot(mySim$y[s],PCA$x[s,2])
cor(mySim$y[s],PCA$x[s,2])
## [1] -0.4110317
cbind(mySim$y[s],1, PCA$x[s,2],Xtop[s])
## [,1] [,2] [,3] [,4]
## [1,] 12.28858 1 -5.084303 0
## [2,] 12.86865 1 -5.068997 0
## [3,] 12.32488 1 1.128980 0
## [4,] 11.16614 1 -4.937338 0
## [5,] 10.56928 1 -11.103424 0
## [6,] 14.05080 1 -7.739337 2
## [7,] 11.46929 1 3.789764 0
## [8,] 10.83520 1 -4.570223 0
## [9,] 10.45742 1 5.248940 0
## [10,] 10.09067 1 7.063429 0
# y is the response variable from the simulated data
y = mySim$y
# X is the design matrix.
# First, we create a matrix with a column of 1's (intercept term),
# then the second column of PCA$x (the second principal component)
# and finally the Xtop variable, the most significant marker.
X = cbind(1, PCA$x[,2], Xtop)
# LHS is the left-hand side of the normal equation for linear regression: X'X
LHS = t(X) %*% X
# C is the covariance matrix of the regression coefficients, computed as (X'X)^(-1)
# The `solve()` function computes the inverse of a matrix, so we get the covariance matrix here.
C = solve(LHS)
# RHS is the right-hand side of the normal equation for linear regression: X'y
RHS = t(X) %*% y
# b is the vector of estimated regression coefficients.
# It is computed as (X'X)^(-1) * X'y, which is equivalent to solving for b in the normal equation.
b = C %*% RHS
# yb is the predicted values (fitted values) from the regression.
# This is computed by multiplying the design matrix X with the regression coefficients b.
yb = X %*% b
# e is the residuals or errors.
# This is the difference between the observed values (y) and the predicted values (yb).
e = y - yb
# n is the sample size (number of observations).
n = length(y)
# ve is the residual variance, calculated as the sum of squared residuals divided by (n-1).
# This is an estimate of the variance of the error terms in the regression model.
ve = sum(e^2) / (n - 1)
# vt is the variance-covariance matrix of the regression coefficients.
# It is computed as the residual variance (ve) multiplied by the covariance matrix C.
# This matrix gives the variance of the estimated coefficients.
vt = C * ve
# t is the t-statistic for each coefficient.
# It is computed as the estimated coefficients (b) divided by the standard error (sqrt(diag(vt))).
# The standard error is the square root of the diagonal elements of the variance-covariance matrix.
t = b / sqrt(diag(vt))
# p is the two-tailed p-value for each coefficient.
# It is computed using the t-distribution with n-2 degrees of freedom.
# The p-value tells us if the corresponding coefficient is significantly different from zero.
p = 2 * (1 - pt(abs(t), n - 2))
# Phenotypes by genotypes
LM=cbind(b, t, sqrt(diag(vt)), p)
rownames(LM)=cbind("Mean", "PC2","Xtop")
colnames(LM)=cbind("b", "t", "SD","p")
LM
## b t SD p
## Mean 10.373413962 62.1283694 0.16696743 0.0000000000
## PC2 -0.008410043 -0.4091586 0.02055448 0.6827372269
## Xtop 0.827778718 3.5759778 0.23148318 0.0004111298
b: The estimated coefficients for the intercept (Mean), PC2, and
Xtop.
t: The t-statistics for testing whether each coefficient is
significantly different from zero.
SD: The standard errors of the coefficients.
p: The p-values for testing the significance of each coefficient.
# G is the matrix of data from 'myGD' (presumably a genotype matrix or similar).
# It removes the first column, which may be an ID or other irrelevant information.
G = myGD[, -1]
# 'n' is the number of rows (samples/observations).
n = nrow(G)
# 'm' is the number of columns (markers or variables).
m = ncol(G)
# Initialize a vector 'P' to store the p-values for each marker.
# Initially, all values are set to NA.
P = matrix(NA, 1, m)
# Loop over each marker (i-th column of G)
for (i in 1:m){
# Extract the data for the i-th marker (x) from the matrix G.
x = G[, i]
# Check if all the values in x are the same. If true, p-value is set to 1 (no variation).
if(max(x) == min(x)){
p = 1
} else {
# If there is variation, proceed with the regression analysis.
# Create a design matrix X with a column of 1's (for intercept),
# the second principal component (PCA$x[,2]), and the marker data (x).
X = cbind(1, PCA$x[,2], x)
# Calculate the left-hand side of the normal equation (X'X).
LHS = t(X) %*% X
# Calculate the covariance matrix C as the inverse of LHS (X'X)^(-1).
C = solve(LHS)
# Calculate the right-hand side of the normal equation (X'y).
RHS = t(X) %*% y
# Estimate the regression coefficients (b).
b = C %*% RHS
# Predicted values (fitted values) from the regression model.
yb = X %*% b
# Calculate the residuals (e = y - yb).
e = y - yb
# n is the number of samples (y length).
n = length(y)
# ve is the residual variance, calculated as the sum of squared residuals divided by (n - 1).
ve = sum(e^2) / (n - 1)
# vt is the variance-covariance matrix of the estimated coefficients,
# calculated as C * residual variance (ve).
vt = C * ve
# t is the t-statistic for each coefficient, calculated as b / standard error.
# The standard error is the square root of the diagonal elements of vt.
t = b / sqrt(diag(vt))
# p is the p-value for the last coefficient, based on the t-statistic and degrees of freedom (n-2).
p = 2 * (1 - pt(abs(t), n - 2))
}
# Store the p-value for the i-th marker in the P vector.
P[i] = p[length(p)]
}
# End of looping over markers
#QQ plot
p.obs=P[!index1to5]
m2=length(p.obs)
p.uni=runif(m2,0,1)
order.obs=order(p.obs)
order.uni=order(p.uni)
plot(-log10(p.uni[order.uni]),
-log10(p.obs[order.obs]), ylim=c(0,7))
abline(a = 0, b = 1, col = "red")
G=myGD[,-1]
n=nrow(G)
m=ncol(G)
P=matrix(NA,1,m)
for (i in 1:m){
x=G[,i]
if(max(x)==min(x)){
p=1}else{
X=cbind(1, PCA$x[,1:3],x)
LHS=t(X)%*%X
C=solve(LHS)
RHS=t(X)%*%y
b=C%*%RHS
yb=X%*%b
e=y-yb
n=length(y)
ve=sum(e^2)/(n-1)
vt=C*ve
t=b/sqrt(diag(vt))
p=2*(1-pt(abs(t),n-2))
} #end of testing variation
P[i]=p[length(p)]
} #end of looping for markers
# QQ Plot
# GLM with PC2
p.obs=P[!index1to5]
m2=length(p.obs)
p.uni=runif(m2,0,1)
order.obs=order(p.obs)
order.uni=order(p.uni)
# GLM with PC1:3
plot(-log10(p.uni[order.uni]),
-log10(p.obs[order.obs]), ylim=c(0,7))
abline(a = 0, b = 1, col = "red")
#
p.obs=P
m2=length(p.obs)
p.uni=runif(m2,0,1)
order.obs=order(p.obs)
order.uni=order(p.uni)
plot(-log10(p.uni[order.uni]),
-log10(p.obs[order.obs]), )
abline(a = 0, b = 1, col = "red")
#
color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
m=nrow(myGM)
plot(t(-log10(P))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")