1. Phenotype Simulation

1.1 More genes, more normal

# Set up 4x4 plotting layout
par(mfrow = c(4, 4))  

# Function to generate and plot data
plot_gene_expression <- function(m, n) {
  x = runif(m)  # Generate gene effects between 0 and 1
  gene = matrix(x, n, m, byrow = TRUE)  # Create gene matrix with identical rows
  
  # Generate random binary gene expression data
  galton = matrix(runif(n * m), n, m)
  galton.binary = galton < 0.5  # Convert to binary (dominant/recessive)
  
  # Modify gene matrix
  gene[galton.binary] = 0  # Set genes to 0 where recessive
  
  # Calculate total gene expression per individual
  y = rowSums(gene)
  
  # Plot histogram
  hist(y, main = paste("Histogram (m =", m, ")"), col = "skyblue", border = "white")
  
  # Plot density
  plot(density(y), main = paste("Density Plot (m =", m, ")"), col = "blue", lwd = 2)
}

# Generate plots for m = 10
for (i in 1:8) {
  plot_gene_expression(m = 10, n = 1000)
}

# Generate plots for m = 100
for (i in 1:8) {
  plot_gene_expression(m = 100, n = 1000)
}

1.2 Phenotype Simulation Function

Components of Phenotypic Variation

The phenotype (\(Y\)) of an individual is influenced by genetic and environmental factors, as well as their interactions and residual effects:

\[ Y = G + E + G \times E + \text{Residual} \]

where:

  • \(G\) (Genetic effect) consists of:

    • Additive effects: The sum of individual allele effects.
    • Dominance effects: Interactions between alleles at the same locus.
    • Epistasis: Interactions between alleles at different loci.
  • \(E\) (Environmental effect): External factors such as year and location.

  • \(G \times E\) (Genotype-by-environment interaction): The variation in genetic effects across different environments.

  • Residual: Includes unexplained variation such as measurement error.

# Load Data
myGD = read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt", head = T)
myGM = read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", head = T)
# Subset the genetic data (myGD) by removing the first column
X = myGD[, -1]
  • X: A matrix where rows represent individuals (n) and columns represent genetic markers (m).
  • h2: The heritability of the trait (a value between 0 and 1), which represents the proportion of phenotypic variance explained by genetic factors.
  • alpha: A scaling factor used for generating additive effects when distribution is not normal.
  • NQTN: Number of Quantitative Trait Nucleotides (QTN), i.e., the number of genetic markers assumed to influence the trait.
  • distribution: A string specifying the distribution of QTN effects (either “norm” for normal distribution or another choice for exponential decay).
G2P = function(X, h2, alpha, NQTN, distribution) {
  n = nrow(X)  # Number of individuals (rows)
  m = ncol(X)  # Number of SNP markers (columns)
  
  # Sampling QTN
  QTN.position = sample(m, NQTN, replace = F)  # NQTN: number of QTNs
  SNPQ = as.matrix(X[, QTN.position])  # Genotype of allt the QTNs
  QTN.position
  
  # QTN effects
  if (distribution == "norm") {  
    addeffect = rnorm(NQTN, 0, 1) # The gene effects of the QTNs are drawn from a normal distribution with mean 0 and variance 1
  } else {  
    addeffect = alpha^(1:NQTN) # Otherwise, the gene effects follow Geometry distribution
  }
  
  # Simulate phenotype
  effect = SNPQ %*% addeffect  # Calculates genetic/additive effect
  effectvar = var(effect)
  residualvar = (effectvar - h2 * effectvar) / h2  # Calculates the residual variance based on the given heritability formula
  residual = rnorm(n, 0, sqrt(residualvar))  # Simulates residual effects based on its variance
  y = effect + residual  # The final phenotype (y) is computed by adding the genetic effect and residual noise
  
  return(list(addeffect = addeffect, y = y, add = effect, residual = residual, QTN.position = QTN.position, SNPQ = SNPQ))
}

1.3 VA & Heritability

set.seed(1234567)
# Set up the plotting area with 3 rows and 1 column of plots
# Adjust margins: bottom (3), left (4), top (1), right (1)
par(mfrow = c(1, 3), mar = c(3, 4, 1, 1))

# Run G2P function 
myG2P = G2P(X, .75, 1, 10, "norm")  # Simulating phenotype with h2 = 0.75, alpha = 1, NQTN = 10
str(myG2P)  # Display structure of the result
## List of 6
##  $ addeffect   : num [1:10] 0.91 -0.919 -0.158 1.102 1.542 ...
##  $ y           : num [1:281, 1] 1.269 -1.574 0.126 -1.116 -5.015 ...
##  $ add         : num [1:281, 1] 1.108 -1.3807 0.0654 -0.0253 -5.8364 ...
##  $ residual    : num [1:281] 0.1607 -0.1933 0.0609 -1.0907 0.821 ...
##  $ QTN.position: int [1:10] 2556 2638 1947 1148 2789 1697 2688 59 2471 1088
##  $ SNPQ        : int [1:281, 1:10] 2 0 2 2 0 2 0 2 0 2 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:10] "PZA03610.1" "PZA03593.1" "PZA00500.3" "PZB01473.3" ...
# Calculate variance components: additive, residual, and total phenotype
va = var(myG2P$add)         # Additive genetic variance
ve = var(myG2P$residual)    # Environmental (residual) variance
vp = var(myG2P$y)           # Total phenotypic variance

# Store variance components in a matrix
v = matrix(c(va, ve, vp), 1, 3)
colnames(v) = c("A", "E", "P")  # Label columns: A = Additive, E = Environmental, P = Phenotypic

# Create a bar plot of variance components for h2 = 0.75
barplot(v, col = "gray")


# Repeat the process for h2 = 0.5
myG2P = G2P(X, .5, 1, 10, "norm")  # Simulating phenotype with h2 = 0.5
va = var(myG2P$add)     
ve = var(myG2P$residual)
vp = var(myG2P$y)

# Store variance components in a matrix
v = matrix(c(va, ve, vp), 1, 3)
colnames(v) = c("A", "E", "P")  

# Create a bar plot of variance components for h2 = 0.5
barplot(v, col = "gray")


# Repeat the process for h2 = 0.25
myG2P = G2P(X, .25, 1, 10, "norm")  # Simulating phenotype with h2 = 0.25
va = var(myG2P$add)     
ve = var(myG2P$residual)
vp = var(myG2P$y)

# Store variance components in a matrix
v = matrix(c(va, ve, vp), 1, 3)
colnames(v) = c("A", "E", "P")  

# Create a bar plot of variance components for h2 = 0.25
barplot(v, col = "gray")

1.4 Check gene effect distribution

set.seed(1234567)
# Check gene effect distribution with different alpha values
par(mfrow = c(3, 3), mar = c(3, 4, 1, 1))  # Set up a 3x3 plotting layout

# Define alpha values and corresponding colors
alpha_values <- c(1, 0.95, 0.5)
effect_types <- c("Additive Effect", "Residual Effect", "Total Phenotype")
colors <- c("deepskyblue3", "tomato3", "black")  # Distinct colors for each effect type

# Loop through different alpha values and generate plots
for (alpha in alpha_values) {
  myG2P = G2P(X, h2 = 0.5, alpha = alpha, NQTN = 10, distribution = "geom")
  
  # Extract components and plot histograms
  data_list <- list(myG2P$add, myG2P$residual, myG2P$y)
  for (i in 1:3) {
    hist(
      data_list[[i]], 
      main = paste(effect_types[i], "(α =", alpha, ")"), 
      col = colors[i], 
      border = "gray", 
      xlab = "Value", 
      ylab = "Frequency"
    )
  }
}

The trend reflects how well additive genetic effects explain phenotypic variation. Larger gene effects tighten the alignment between phenotype and additive effect, while smaller effects inflate residual variance.

1.5 Effect of Number of Genes (NQTN)s

set.seed(1234567890)
# Check on distribution
par(mfrow = c(3, 3), mar = c(3, 4, 1, 1))  # Set up a 3x3 plotting layout

# Define QTN values and labels
QTN_values <- c(2, 10, 100)
effect_types <- c("Additive Effect", "Residual Effect", "Total Phenotype")

# Colors for better visualization
colors <- c("deepskyblue3", "tomato3", "black")

# Loop through different QTN values and generate histograms
for (QTN in QTN_values) {
  myG2P = G2P(X, 0.5, 1, QTN, "norm")  # Simulate phenotype
  
  # Extract components and plot histograms
  data_list <- list(myG2P$add, myG2P$residual, myG2P$y)
  for (i in 1:3) {
    hist(
      data_list[[i]], 
      main = paste(effect_types[i], "(", QTN, "QTNs)"), 
      col = colors[i], 
      border = "gray", 
      xlab = "Value", 
      ylab = "Frequency"
    )
  }
}

# Set up density plot layout
par(mfrow = c(3, 1))

# Loop through different QTN values and generate density plots
for (QTN in QTN_values) {
  myG2P = G2P(X, 0.5, 1, QTN, "norm")  # Simulate phenotype
  
  # Compute densities
  dens_y <- density(myG2P$y)
  dens_add <- density(myG2P$add)
  dens_residual <- density(myG2P$residual)
  
  # Determine maximum and minimum values across all densities for proper scaling
  max_density <- max(dens_y$y, dens_add$y, dens_residual$y)
  min_density <- min(dens_y$y, dens_add$y, dens_residual$y)
  
  # Determine maximum and minimum x-values across all densities
  max_x <- max(dens_y$x, dens_add$x, dens_residual$x)
  min_x <- min(dens_y$x, dens_add$x, dens_residual$x)
  
  # Set dynamic xlim and ylim to fit all density curves
  xlim_range <- c(min_x, max_x)
  ylim_range <- c(min_density, max_density * 1.1)  # Add slight margin for clarity
  
  # Plot density with dynamically scaled xlim and ylim
  plot(
    dens_y, 
    xlim = xlim_range,  # Set dynamic xlim
    ylim = ylim_range,  # Set dynamic ylim
    main = paste(QTN, "QTNs"), 
    col = "gray", 
    lwd = 2
  )
  
  # Add lines for different genetic effects
  lines(dens_add, col = "deepskyblue3", lwd = 2)      # Additive effect
  lines(dens_residual, col = "tomato3", lwd = 2)  # Residual effect
}

As the number of QTNs increases, the additive effect (blue) becomes a more prominent predictor of the phenotype (black), while the residual (red) decreases, indicating that a greater proportion of the phenotypic variation is explained by the genetic model. This trend suggests that incorporating more QTNs enhances the model’s ability to better capture the genetic architecture underlying the trait.

2. Correlation

2.1 Approximation of t distribution

# Simulate the t-statistics for correlations: 
cort = function(n=10000, df=100) {
  z = replicate(n, {
    x = rnorm(df + 2)   # Generate a random sample of size df + 2 from a normal distribution
    y = rnorm(df + 2)   # Generate another random sample of size df + 2 from a normal distribution
    r = cor(x, y)       # Compute the Pearson correlation coefficient between x and y
    t = r / sqrt((1 - r^2) / df)  # Calculate the t-statistic for the correlation
  })
  return(z)  # Return the vector z contains the t-statistics for all n iterations and is returned.
}

# Simulate t-statistics (x) from random data:
x = cort(10000, 5)
# Generate the theoretical t-distribution
t = rt(100000, 5)
# Visualize the distributions
plot(density(x), col = "blue")
lines(density(t), col = "red")

By comparing the two distributions, you can observe how well the simulated t-statistics match the theoretical t-distribution. In this case, the simulated t-statistics should resemble the t-distribution with 5 degrees of freedom.

2.2 Influence of DF

set.seed(1234567)
# Set up 3 rows and 1 column for plotting
par(mfrow = c(3, 1))

# Loop through different degrees of freedom (df)
for (df in c(1, 3, 5)) {
  # Generate data
  x = cort(10000, df)
  t = rt(100000, df)
  
  # Find the appropriate x and y limits for the plots
  xlim_range <- range(c(x, t))
  ylim_range <- c(0, max(density(x)$y, density(t)$y))
  
  # Plot density for cort (blue) and t (red)
  plot(density(x), col = "blue", xlim = xlim_range, ylim = ylim_range, main = paste("df =", df))
  lines(density(t), col = "red")
}

2.3 Can we use correlation to map genes?

# Load data
myGD = read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt", head = T)
myGM = read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", head = T)
source("http://zzlab.net/StaGen/2020/R/G2P.R")
source("http://zzlab.net/StaGen/2020/R/GWASbyCor.R")
# Subset the genetic data (myGD) by removing the first column
X = myGD[, -1]

# Phenotype simulation
set.seed(123)
mySim = G2P(X = X, h2 = .75, alpha = 1, NQTN = 10, distribution = "normal")

# GWAS by correlation
p = GWASbyCor(X = X, y = mySim$y) # Call the GWASbyCor function with phenotype (mySim$y) and genetic data (X) to get p-values
## Warning in cor(y, X): the standard deviation is zero
# Manhattan plot
# Create a color vector to alternate between different colors for each Chromosome (10 chromosomes total)
color.vector <- rep(c("deepskyblue", "orange", "forestgreen", "indianred3"), 10)
# number of SNPs
m = nrow(myGM) 
# Plot -log10(p) against SNPs
plot(t(-log10(p)) ~ seq(1:m), col = color.vector[myGM[, 2]]) # Color the points based on chromosome number

# True QTN position
abline(v = mySim$QTN.position, lty = 2, lwd = 2, col = "gray")
mySim$QTN.position
##  [1] 2463 2511 2227  526  195 2986 1842 1142 1253 1268
# Top 10 SNPs
top10 = order(p)[1:10]
top10
##  [1] 2227 1842  526   97  977   98 1041  244 1089 1765
p[top10]
##  [1] 2.918488e-11 1.000000e-10 1.003188e-08 6.590089e-07 2.582349e-06
##  [6] 3.263310e-06 3.355322e-06 3.594451e-06 7.422125e-06 1.025465e-05
# Add a horizontal line at the -log10(p) value corresponding to the largest p-value in the top 10
abline(h = -log10(max(p[top10])), lty = 1, lwd = 2, col = "black")

# Detect the QTNs that are found in the top 10 SNPs
detected = intersect(top10, mySim$QTN.position)
detected
## [1] 2227 1842  526
# Add vertical lines at the positions of the detected QTNs (using blue color)
abline(v = detected, lty = 2, lwd = 2, col = "blue")

# Identify the false positives in the top 10 (SNPs in top 10 that are not true QTNs)
falsePositive = setdiff(top10, mySim$QTN.position)
falsePositive
## [1]   97  977   98 1041  244 1089 1765
abline(v = falsePositive, lty = 2, lwd = 2, col = "red")

order() gives you the indices that would sort the vector, whereas sort() directly gives you the sorted vector.
True positives, false positives, and false negatives.

2.4 Let’s have more fun!

# Set the random seed for reproducibility, so the results are the same each time
set.seed(123456)
# Create a logical index based on the chromosome number
index1to5 = myGM[,2] < 6
# This is used to select first half genome (chr 1 to 5) from X
X1to5 = X[,index1to5]

# Simulate phenotype using the G2P function:
mySim = G2P(X = X1to5, h2 = .75, alpha = 1, NQTN = 10, distribution = "norm")
# Display the structure 
str(mySim)
## List of 7
##  $ addeffect   : num [1:10] 1 1 1 1 1 1 1 1 1 1
##  $ y           : num [1:281, 1] 8.04 7.64 11.13 7.46 7.47 ...
##  $ add         : num [1:281, 1] 7 6 8 6 8 10 10 8 10 10 ...
##  $ residual    : num [1:281] 1.043 1.64 3.127 1.46 -0.533 ...
##  $ QTN.position: int [1:10] 1084 234 1066 1905 1150 711 237 182 1252 742
##  $ SNPQ        : int [1:281, 1:10] 0 0 0 1 0 0 2 2 0 0 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:10] "PZB00228.6" "PZA02963.5" "PZA02888.3" "PZB02424.1" ...
##  $ int         : num [1:281, 1] 0 0 0 0 0 0 0 0 0 0 ...
# Plot all the markers on the chromosomes
plot(myGM[,c(2,3)])
# Overlay a red scatter plot at the positions of the QTNs 
lines(myGM[mySim$QTN.position, c(2, 3)], type = "p", col = "red")
# Highlight the QTN positions 
points(myGM[mySim$QTN.position, c(2, 3)], type = "p", col = "blue", cex = 5)

2.5 Association test by correlation

# GWAS function: using correlation to calculate p-values
GWASbyCor = function(X, y) {
  # Calculate the number of individuals in the dataset
  n = nrow(X)
  
  # Calculate the correlation between phenotype (y) and each SNP
  r = cor(y, X)
  
  # Calculate the t-statistic for each SNP using the correlation r
  t = r / sqrt((1 - r^2) / (n - 2))
  
  # Calculate the p-value for each SNP using the t-distribution
  p = 2 * (1 - pt(abs(t), n - 2))   # The pt function is used here to calculate the cumulative probability of obtaining a t-statistic at or below abs(t) with n - 2 degrees of freedom. 
  
  # Replace any zero p-values with a very small value (1e-10)
  zeros = p == 0
  p[zeros] = 1e-10
  
  # Return the calculated p-values
  return(p)
}