QTNs 0n CHR 1-5, leave 6-10 empty

# simulate the phenotype with G2P function
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
source("http://zzlab.net/StaGen/2020/R/G2P.R")
source("http://zzlab.net/StaGen/2020/R/GWASbyCor.R")
X=myGD[,-1]
index1to5=myGM[,2]<6
X1to5 = X[,index1to5]
set.seed(99164)
mySim=G2P(X= X1to5,h2=.75,alpha=1,NQTN=10,distribution="norm")
p= GWASbyCor(X=X,y=mySim$y)

# Manhattan plot
color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])

# True QTN position
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "gray")

mySim$QTN.position
##  [1]  382  563  448  902  380  790 1185 1025  674  743
# Top 10 SNPs
index=order(p)
top10=index[1:10]
top10
##  [1]  902 1185 1025  743  157  380 1859 1866  674  158
abline(h=-log10(max(p[top10])), lty = 1, lwd=2, col = "black")

# Detected QTNs within top 10
detected=intersect(top10,mySim$QTN.position)
detected
## [1]  902 1185 1025  743  380  674
abline(v= detected, lty = 2, lwd=2, col = "blue")

# False positive within top 10
falsePositive=setdiff(top10, mySim$QTN.position)
falsePositive
## [1]  157 1859 1866  158
abline(v= falsePositive, lty = 2, lwd=2, col = "red")

QQ plot

# QQ plot
p.obs=p[!index1to5]
m2=length(p.obs)
p.uni=runif(m2,0,1)
order.obs=order(p.obs)
order.uni=order(p.uni)

plot(-log10(p.uni[order.uni]),-log10(p.obs[order.obs]))
abline(a = 0, b = 1, col = "red")

PCA in R using first 10 SNPs

# Perform Principal Component Analysis (PCA) on the first 10 columns of the dataset X
pca=prcomp(X[,1:10])
str(pca)
## List of 5
##  $ sdev    : num [1:10] 1.394 1.114 0.922 0.828 0.793 ...
##  $ rotation: num [1:10, 1:10] -0.0574 0.058 -0.5834 0.1601 0.5196 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:10] "PZB00859.1" "PZA01271.1" "PZA03613.2" "PZA03613.1" ...
##   .. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:10] 1.52 1.02 1.42 1.5 1.06 ...
##   ..- attr(*, "names")= chr [1:10] "PZB00859.1" "PZA01271.1" "PZA03613.2" "PZA03613.1" ...
##  $ scale   : logi FALSE
##  $ x       : num [1:281, 1:10] 1.45 2.05 1.98 1.78 2.05 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
# Create a heatmap of the scores (principal components for each observation)
# pca$x contains the scores (i.e., the transformed data in principal component space)
# Colv = NA and Rowv = NA prevent reordering of rows and columns (no hierarchical clustering)
heatmap(pca$x,Colv = NA,Rowv = NA)

# Create a heatmap of the loadings (coefficients of the principal components)
# pca$rotation contains eigenvectors
heatmap(pca$rotation,Colv = NA,Rowv = NA) 

# the eigenvalues (Also known as Variance Explained in PCA) can be obtained as the squared values of the standard deviations stored in pca$sdev
# var(pca$x) calculates the variance-covariance matrix of the principal component scores. The variance-covariance matrix shows the variance along each principal component (diagonal elements) and the covariances between different principal components (off-diagonal elements).
# Both pca$sdev^2 and diag(var(pca$x)) are used to calculate the variance explained by each principal component in PCA, but they are derived in different ways. 
plot(pca$sdev^2, diag(var(pca$x)))

PCA in R using all SNPs

PCA=prcomp(X)
str(PCA)
## List of 5
##  $ sdev    : num [1:281] 10.28 7.76 6.27 5.73 5.65 ...
##  $ rotation: num [1:3093, 1:281] 0.00103 0.02962 0.00296 0.03495 -0.01134 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3093] "PZB00859.1" "PZA01271.1" "PZA03613.2" "PZA03613.1" ...
##   .. ..$ : chr [1:281] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:3093] 1.52 1.02 1.42 1.5 1.06 ...
##   ..- attr(*, "names")= chr [1:3093] "PZB00859.1" "PZA01271.1" "PZA03613.2" "PZA03613.1" ...
##  $ scale   : logi FALSE
##  $ x       : num [1:281, 1:281] 1.68 -1.6 -0.9 2.13 0.63 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:281] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
heatmap(PCA$x,Colv = NA,Rowv = NA)

heatmap(PCA$rotation,Colv = NA,Rowv = NA)

plot(PCA$sdev^2, diag(var(PCA$x)))

# Extraction
dim(PCA$sdev)
## NULL
str((PCA$sdev))
##  num [1:281] 10.28 7.76 6.27 5.73 5.65 ...
dim(PCA$rotation)
## [1] 3093  281
dim(PCA$x)
## [1] 281 281
heatmap(PCA$x, Colv = NA)

PCA$x[1:10,1:5]
##               PC1        PC2         PC3        PC4         PC5
##  [1,]   1.6780775 -4.9373382   1.0029279  0.4804023  -0.2323756
##  [2,]  -1.6021749 -4.7322790  -0.6886187 -4.2666686   1.3539311
##  [3,]  -0.8999517 -6.2186090   2.2737655 -1.9902096   2.9406566
##  [4,]   2.1334477 -6.2879301   5.6837161 -2.8050425 -15.9185028
##  [5,]   0.6302372 -4.8947416   0.8189001 -2.1950980  -2.6457625
##  [6,] -13.6690754  0.8736302 -21.6324613  1.6339498  -2.3971126
##  [7,]  -0.5680841 -5.9401064   0.3307201 -4.5042693   2.9859048
##  [8,]   3.7670958  2.7504406   2.2384039 -1.6777158  -4.3525959
##  [9,]   5.4327297 -0.1516547   1.0950912 -2.1140016  -0.3760369
## [10,]  -0.9801181 -5.4792220   1.0430492 -5.0215953   4.4996895
# Contribution
pcavar =PCA$sdev^2
proportion= pcavar/sum(pcavar)
par(mfrow=c(1,4),mar = c(3,4,1,1))
plot(PCA$sdev)
plot(pcavar)
plot(ecdf(pcavar))
plot(proportion,type="b")

# Visualization
plot(PCA$x[,1],PCA$x[,2],col="red")

# Association with phenotypes
par(mfrow=c(2,1),mar = c(3,4,1,1))
plot(mySim$y,PCA$x[,1])
cor(mySim$y,PCA$x[,1])
##            [,1]
## [1,] 0.09621831
plot(mySim$y,PCA$x[,2])

cor(mySim$y,PCA$x[,2])
##             [,1]
## [1,] -0.08018205
pca1to5=prcomp(X[,index1to5])
pca6to10=prcomp(X[,!index1to5])

par(mfrow=c(2,2), mar = c(3,4,1,1))
plot(mySim$y, pca1to5$x[,1])
cor(mySim$y, pca1to5$x[,1])
##           [,1]
## [1,] -0.110093
plot(mySim$y, pca6to10$x[,1])
cor(mySim$y, pca6to10$x[,1])
##             [,1]
## [1,] -0.07469242
plot(mySim$y, pca1to5$x[,2])
cor(mySim$y, pca1to5$x[,2])
##            [,1]
## [1,] 0.09782618
plot(mySim$y, pca6to10$x[,2])

cor(mySim$y, pca6to10$x[,2])
##             [,1]
## [1,] -0.05796111

Phenotypes by genotypes

order.obs=order(p.obs)
X6to10=X[,!index1to5]
Xtop=X6to10[,order.obs[1]]

boxplot(mySim$y~Xtop)

Correlation

PCA=prcomp(X)
plot(mySim$y,PCA$x[,2])

cor(mySim$y,PCA$x[,2])
##             [,1]
## [1,] -0.08018205

Linear regression

set.seed(99164)
s=sample(length(mySim$y),10)
plot(mySim$y[s],PCA$x[s,2])

cor(mySim$y[s],PCA$x[s,2])
## [1] -0.4110317

Example from the ten individuals

cbind(mySim$y[s],1, PCA$x[s,2],Xtop[s])
##           [,1] [,2]       [,3] [,4]
##  [1,] 12.28858    1  -5.084303    0
##  [2,] 12.86865    1  -5.068997    0
##  [3,] 12.32488    1   1.128980    0
##  [4,] 11.16614    1  -4.937338    0
##  [5,] 10.56928    1 -11.103424    0
##  [6,] 14.05080    1  -7.739337    2
##  [7,] 11.46929    1   3.789764    0
##  [8,] 10.83520    1  -4.570223    0
##  [9,] 10.45742    1   5.248940    0
## [10,] 10.09067    1   7.063429    0

Action in R

# y is the response variable from the simulated data
y = mySim$y

# X is the design matrix. 
# First, we create a matrix with a column of 1's (intercept term), 
# then the second column of PCA$x (the second principal component) 
# and finally the Xtop variable, the most significant marker.
X = cbind(1, PCA$x[,2], Xtop)

# LHS is the left-hand side of the normal equation for linear regression: X'X
LHS = t(X) %*% X

# C is the covariance matrix of the regression coefficients, computed as (X'X)^(-1)
# The `solve()` function computes the inverse of a matrix, so we get the covariance matrix here.
C = solve(LHS)

# RHS is the right-hand side of the normal equation for linear regression: X'y
RHS = t(X) %*% y

# b is the vector of estimated regression coefficients. 
# It is computed as (X'X)^(-1) * X'y, which is equivalent to solving for b in the normal equation.
b = C %*% RHS

# yb is the predicted values (fitted values) from the regression. 
# This is computed by multiplying the design matrix X with the regression coefficients b.
yb = X %*% b

# e is the residuals or errors. 
# This is the difference between the observed values (y) and the predicted values (yb).
e = y - yb

# n is the sample size (number of observations).
n = length(y)

# ve is the residual variance, calculated as the sum of squared residuals divided by (n-1).
# This is an estimate of the variance of the error terms in the regression model.
ve = sum(e^2) / (n - 1)

# vt is the variance-covariance matrix of the regression coefficients. 
# It is computed as the residual variance (ve) multiplied by the covariance matrix C.
# This matrix gives the variance of the estimated coefficients.
vt = C * ve

# t is the t-statistic for each coefficient. 
# It is computed as the estimated coefficients (b) divided by the standard error (sqrt(diag(vt))).
# The standard error is the square root of the diagonal elements of the variance-covariance matrix.
t = b / sqrt(diag(vt))

# p is the two-tailed p-value for each coefficient. 
# It is computed using the t-distribution with n-2 degrees of freedom. 
# The p-value tells us if the corresponding coefficient is significantly different from zero.
p = 2 * (1 - pt(abs(t), n - 2))
# Phenotypes by genotypes
LM=cbind(b, t, sqrt(diag(vt)), p)
rownames(LM)=cbind("Mean", "PC2","Xtop")
colnames(LM)=cbind("b", "t", "SD","p")
LM
##                 b          t         SD            p
## Mean 10.373413962 62.1283694 0.16696743 0.0000000000
## PC2  -0.008410043 -0.4091586 0.02055448 0.6827372269
## Xtop  0.827778718  3.5759778 0.23148318 0.0004111298

b: The estimated coefficients for the intercept (Mean), PC2, and Xtop.
t: The t-statistics for testing whether each coefficient is significantly different from zero.
SD: The standard errors of the coefficients.
p: The p-values for testing the significance of each coefficient.

Loop through genome

# G is the matrix of data from 'myGD' (presumably a genotype matrix or similar).
# It removes the first column, which may be an ID or other irrelevant information.
G = myGD[, -1]

# 'n' is the number of rows (samples/observations).
n = nrow(G)

# 'm' is the number of columns (markers or variables).
m = ncol(G)

# Initialize a vector 'P' to store the p-values for each marker. 
# Initially, all values are set to NA.
P = matrix(NA, 1, m)

# Loop over each marker (i-th column of G)
for (i in 1:m){
  
  # Extract the data for the i-th marker (x) from the matrix G.
  x = G[, i]
  
  # Check if all the values in x are the same. If true, p-value is set to 1 (no variation).
  if(max(x) == min(x)){
    p = 1
  } else {
    # If there is variation, proceed with the regression analysis.
    # Create a design matrix X with a column of 1's (for intercept), 
    # the second principal component (PCA$x[,2]), and the marker data (x).
    X = cbind(1, PCA$x[,2], x)
    
    # Calculate the left-hand side of the normal equation (X'X).
    LHS = t(X) %*% X
    
    # Calculate the covariance matrix C as the inverse of LHS (X'X)^(-1).
    C = solve(LHS)
    
    # Calculate the right-hand side of the normal equation (X'y).
    RHS = t(X) %*% y
    
    # Estimate the regression coefficients (b).
    b = C %*% RHS
    
    # Predicted values (fitted values) from the regression model.
    yb = X %*% b
    
    # Calculate the residuals (e = y - yb).
    e = y - yb
    
    # n is the number of samples (y length).
    n = length(y)
    
    # ve is the residual variance, calculated as the sum of squared residuals divided by (n - 1).
    ve = sum(e^2) / (n - 1)
    
    # vt is the variance-covariance matrix of the estimated coefficients, 
    # calculated as C * residual variance (ve).
    vt = C * ve
    
    # t is the t-statistic for each coefficient, calculated as b / standard error.
    # The standard error is the square root of the diagonal elements of vt.
    t = b / sqrt(diag(vt))
    
    # p is the p-value for the last coefficient, based on the t-statistic and degrees of freedom (n-2).
    p = 2 * (1 - pt(abs(t), n - 2))
  }
  
  # Store the p-value for the i-th marker in the P vector.
  P[i] = p[length(p)]
}

# End of looping over markers
#QQ plot
p.obs=P[!index1to5]
m2=length(p.obs)
p.uni=runif(m2,0,1)
order.obs=order(p.obs)
order.uni=order(p.uni)

plot(-log10(p.uni[order.uni]),
-log10(p.obs[order.obs]), ylim=c(0,7))
abline(a = 0, b = 1, col = "red")

Using three PCs

G=myGD[,-1]
n=nrow(G)
m=ncol(G)
P=matrix(NA,1,m)
for (i in 1:m){
x=G[,i]
if(max(x)==min(x)){
p=1}else{
X=cbind(1, PCA$x[,1:3],x)
LHS=t(X)%*%X
C=solve(LHS)
RHS=t(X)%*%y
b=C%*%RHS
yb=X%*%b
e=y-yb
n=length(y)
ve=sum(e^2)/(n-1)
vt=C*ve
t=b/sqrt(diag(vt))
p=2*(1-pt(abs(t),n-2))
} #end of testing variation
P[i]=p[length(p)]
} #end of looping for markers

# QQ Plot
# GLM with PC2
p.obs=P[!index1to5]
m2=length(p.obs)
p.uni=runif(m2,0,1)
order.obs=order(p.obs)
order.uni=order(p.uni)
# GLM with PC1:3
plot(-log10(p.uni[order.uni]),
-log10(p.obs[order.obs]), ylim=c(0,7))
abline(a = 0, b = 1, col = "red")

#

p.obs=P
m2=length(p.obs)
p.uni=runif(m2,0,1)
order.obs=order(p.obs)
order.uni=order(p.uni)

plot(-log10(p.uni[order.uni]),
-log10(p.obs[order.obs]), )
abline(a = 0, b = 1, col = "red")

#
color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
m=nrow(myGM)
plot(t(-log10(P))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")