# Set up 4x4 plotting layout
par(mfrow = c(4, 4))
# Function to generate and plot data
plot_gene_expression <- function(m, n) {
x = runif(m) # Generate gene effects between 0 and 1
gene = matrix(x, n, m, byrow = TRUE) # Create gene matrix with identical rows
# Generate random binary gene expression data
galton = matrix(runif(n * m), n, m)
galton.binary = galton < 0.5 # Convert to binary (dominant/recessive)
# Modify gene matrix
gene[galton.binary] = 0 # Set genes to 0 where recessive
# Calculate total gene expression per individual
y = rowSums(gene)
# Plot histogram
hist(y, main = paste("Histogram (m =", m, ")"), col = "skyblue", border = "white")
# Plot density
plot(density(y), main = paste("Density Plot (m =", m, ")"), col = "blue", lwd = 2)
}
# Generate plots for m = 10
for (i in 1:8) {
plot_gene_expression(m = 10, n = 1000)
}
# Generate plots for m = 100
for (i in 1:8) {
plot_gene_expression(m = 100, n = 1000)
}
Components of Phenotypic Variation
The phenotype (\(Y\)) of an individual is influenced by genetic and environmental factors, as well as their interactions and residual effects:
\[ Y = G + E + G \times E + \text{Residual} \]
where:
\(G\) (Genetic effect) consists of:
\(E\) (Environmental effect): External factors such as year and location.
\(G \times E\) (Genotype-by-environment interaction): The variation in genetic effects across different environments.
Residual: Includes unexplained variation such as measurement error.
# Load Data
myGD = read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt", head = T)
myGM = read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", head = T)
# Subset the genetic data (myGD) by removing the first column
X = myGD[, -1]
G2P = function(X, h2, alpha, NQTN, distribution) {
n = nrow(X) # Number of individuals (rows)
m = ncol(X) # Number of SNP markers (columns)
# Sampling QTN
QTN.position = sample(m, NQTN, replace = F) # NQTN: number of QTNs
SNPQ = as.matrix(X[, QTN.position]) # Genotype of allt the QTNs
QTN.position
# QTN effects
if (distribution == "norm") {
addeffect = rnorm(NQTN, 0, 1) # The gene effects of the QTNs are drawn from a normal distribution with mean 0 and variance 1
} else {
addeffect = alpha^(1:NQTN) # Otherwise, the gene effects follow Geometry distribution
}
# Simulate phenotype
effect = SNPQ %*% addeffect # Calculates genetic/additive effect
effectvar = var(effect)
residualvar = (effectvar - h2 * effectvar) / h2 # Calculates the residual variance based on the given heritability formula
residual = rnorm(n, 0, sqrt(residualvar)) # Simulates residual effects based on its variance
y = effect + residual # The final phenotype (y) is computed by adding the genetic effect and residual noise
return(list(addeffect = addeffect, y = y, add = effect, residual = residual, QTN.position = QTN.position, SNPQ = SNPQ))
}
set.seed(1234567)
# Set up the plotting area with 3 rows and 1 column of plots
# Adjust margins: bottom (3), left (4), top (1), right (1)
par(mfrow = c(1, 3), mar = c(3, 4, 1, 1))
# Run G2P function
myG2P = G2P(X, .75, 1, 10, "norm") # Simulating phenotype with h2 = 0.75, alpha = 1, NQTN = 10
str(myG2P) # Display structure of the result
## List of 6
## $ addeffect : num [1:10] 0.91 -0.919 -0.158 1.102 1.542 ...
## $ y : num [1:281, 1] 1.269 -1.574 0.126 -1.116 -5.015 ...
## $ add : num [1:281, 1] 1.108 -1.3807 0.0654 -0.0253 -5.8364 ...
## $ residual : num [1:281] 0.1607 -0.1933 0.0609 -1.0907 0.821 ...
## $ QTN.position: int [1:10] 2556 2638 1947 1148 2789 1697 2688 59 2471 1088
## $ SNPQ : int [1:281, 1:10] 2 0 2 2 0 2 0 2 0 2 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:10] "PZA03610.1" "PZA03593.1" "PZA00500.3" "PZB01473.3" ...
# Calculate variance components: additive, residual, and total phenotype
va = var(myG2P$add) # Additive genetic variance
ve = var(myG2P$residual) # Environmental (residual) variance
vp = var(myG2P$y) # Total phenotypic variance
# Store variance components in a matrix
v = matrix(c(va, ve, vp), 1, 3)
colnames(v) = c("A", "E", "P") # Label columns: A = Additive, E = Environmental, P = Phenotypic
# Create a bar plot of variance components for h2 = 0.75
barplot(v, col = "gray")
# Repeat the process for h2 = 0.5
myG2P = G2P(X, .5, 1, 10, "norm") # Simulating phenotype with h2 = 0.5
va = var(myG2P$add)
ve = var(myG2P$residual)
vp = var(myG2P$y)
# Store variance components in a matrix
v = matrix(c(va, ve, vp), 1, 3)
colnames(v) = c("A", "E", "P")
# Create a bar plot of variance components for h2 = 0.5
barplot(v, col = "gray")
# Repeat the process for h2 = 0.25
myG2P = G2P(X, .25, 1, 10, "norm") # Simulating phenotype with h2 = 0.25
va = var(myG2P$add)
ve = var(myG2P$residual)
vp = var(myG2P$y)
# Store variance components in a matrix
v = matrix(c(va, ve, vp), 1, 3)
colnames(v) = c("A", "E", "P")
# Create a bar plot of variance components for h2 = 0.25
barplot(v, col = "gray")
set.seed(1234567)
# Check gene effect distribution with different alpha values
par(mfrow = c(3, 3), mar = c(3, 4, 1, 1)) # Set up a 3x3 plotting layout
# Define alpha values and corresponding colors
alpha_values <- c(1, 0.95, 0.5)
effect_types <- c("Additive Effect", "Residual Effect", "Total Phenotype")
colors <- c("deepskyblue3", "tomato3", "black") # Distinct colors for each effect type
# Loop through different alpha values and generate plots
for (alpha in alpha_values) {
myG2P = G2P(X, h2 = 0.5, alpha = alpha, NQTN = 10, distribution = "geom")
# Extract components and plot histograms
data_list <- list(myG2P$add, myG2P$residual, myG2P$y)
for (i in 1:3) {
hist(
data_list[[i]],
main = paste(effect_types[i], "(α =", alpha, ")"),
col = colors[i],
border = "gray",
xlab = "Value",
ylab = "Frequency"
)
}
}
The trend reflects how well additive genetic effects explain phenotypic variation. Larger gene effects tighten the alignment between phenotype and additive effect, while smaller effects inflate residual variance.
set.seed(1234567890)
# Check on distribution
par(mfrow = c(3, 3), mar = c(3, 4, 1, 1)) # Set up a 3x3 plotting layout
# Define QTN values and labels
QTN_values <- c(2, 10, 100)
effect_types <- c("Additive Effect", "Residual Effect", "Total Phenotype")
# Colors for better visualization
colors <- c("deepskyblue3", "tomato3", "black")
# Loop through different QTN values and generate histograms
for (QTN in QTN_values) {
myG2P = G2P(X, 0.5, 1, QTN, "norm") # Simulate phenotype
# Extract components and plot histograms
data_list <- list(myG2P$add, myG2P$residual, myG2P$y)
for (i in 1:3) {
hist(
data_list[[i]],
main = paste(effect_types[i], "(", QTN, "QTNs)"),
col = colors[i],
border = "gray",
xlab = "Value",
ylab = "Frequency"
)
}
}
# Set up density plot layout
par(mfrow = c(3, 1))
# Loop through different QTN values and generate density plots
for (QTN in QTN_values) {
myG2P = G2P(X, 0.5, 1, QTN, "norm") # Simulate phenotype
# Compute densities
dens_y <- density(myG2P$y)
dens_add <- density(myG2P$add)
dens_residual <- density(myG2P$residual)
# Determine maximum and minimum values across all densities for proper scaling
max_density <- max(dens_y$y, dens_add$y, dens_residual$y)
min_density <- min(dens_y$y, dens_add$y, dens_residual$y)
# Determine maximum and minimum x-values across all densities
max_x <- max(dens_y$x, dens_add$x, dens_residual$x)
min_x <- min(dens_y$x, dens_add$x, dens_residual$x)
# Set dynamic xlim and ylim to fit all density curves
xlim_range <- c(min_x, max_x)
ylim_range <- c(min_density, max_density * 1.1) # Add slight margin for clarity
# Plot density with dynamically scaled xlim and ylim
plot(
dens_y,
xlim = xlim_range, # Set dynamic xlim
ylim = ylim_range, # Set dynamic ylim
main = paste(QTN, "QTNs"),
col = "gray",
lwd = 2
)
# Add lines for different genetic effects
lines(dens_add, col = "deepskyblue3", lwd = 2) # Additive effect
lines(dens_residual, col = "tomato3", lwd = 2) # Residual effect
}
As the number of QTNs increases, the additive effect (blue) becomes a more prominent predictor of the phenotype (black), while the residual (red) decreases, indicating that a greater proportion of the phenotypic variation is explained by the genetic model. This trend suggests that incorporating more QTNs enhances the model’s ability to better capture the genetic architecture underlying the trait.
# Simulate the t-statistics for correlations:
cort = function(n=10000, df=100) {
z = replicate(n, {
x = rnorm(df + 2) # Generate a random sample of size df + 2 from a normal distribution
y = rnorm(df + 2) # Generate another random sample of size df + 2 from a normal distribution
r = cor(x, y) # Compute the Pearson correlation coefficient between x and y
t = r / sqrt((1 - r^2) / df) # Calculate the t-statistic for the correlation
})
return(z) # Return the vector z contains the t-statistics for all n iterations and is returned.
}
# Simulate t-statistics (x) from random data:
x = cort(10000, 5)
# Generate the theoretical t-distribution
t = rt(100000, 5)
# Visualize the distributions
plot(density(x), col = "blue")
lines(density(t), col = "red")
By comparing the two distributions, you can observe how well the simulated t-statistics match the theoretical t-distribution. In this case, the simulated t-statistics should resemble the t-distribution with 5 degrees of freedom.
set.seed(1234567)
# Set up 3 rows and 1 column for plotting
par(mfrow = c(3, 1))
# Loop through different degrees of freedom (df)
for (df in c(1, 3, 5)) {
# Generate data
x = cort(10000, df)
t = rt(100000, df)
# Find the appropriate x and y limits for the plots
xlim_range <- range(c(x, t))
ylim_range <- c(0, max(density(x)$y, density(t)$y))
# Plot density for cort (blue) and t (red)
plot(density(x), col = "blue", xlim = xlim_range, ylim = ylim_range, main = paste("df =", df))
lines(density(t), col = "red")
}
# Load data
myGD = read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt", head = T)
myGM = read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt", head = T)
source("http://zzlab.net/StaGen/2020/R/G2P.R")
source("http://zzlab.net/StaGen/2020/R/GWASbyCor.R")
# Subset the genetic data (myGD) by removing the first column
X = myGD[, -1]
# Phenotype simulation
set.seed(123)
mySim = G2P(X = X, h2 = .75, alpha = 1, NQTN = 10, distribution = "normal")
# GWAS by correlation
p = GWASbyCor(X = X, y = mySim$y) # Call the GWASbyCor function with phenotype (mySim$y) and genetic data (X) to get p-values
## Warning in cor(y, X): the standard deviation is zero
# Manhattan plot
# Create a color vector to alternate between different colors for each Chromosome (10 chromosomes total)
color.vector <- rep(c("deepskyblue", "orange", "forestgreen", "indianred3"), 10)
# number of SNPs
m = nrow(myGM)
# Plot -log10(p) against SNPs
plot(t(-log10(p)) ~ seq(1:m), col = color.vector[myGM[, 2]]) # Color the points based on chromosome number
# True QTN position
abline(v = mySim$QTN.position, lty = 2, lwd = 2, col = "gray")
mySim$QTN.position
## [1] 2463 2511 2227 526 195 2986 1842 1142 1253 1268
# Top 10 SNPs
top10 = order(p)[1:10]
top10
## [1] 2227 1842 526 97 977 98 1041 244 1089 1765
p[top10]
## [1] 2.918488e-11 1.000000e-10 1.003188e-08 6.590089e-07 2.582349e-06
## [6] 3.263310e-06 3.355322e-06 3.594451e-06 7.422125e-06 1.025465e-05
# Add a horizontal line at the -log10(p) value corresponding to the largest p-value in the top 10
abline(h = -log10(max(p[top10])), lty = 1, lwd = 2, col = "black")
# Detect the QTNs that are found in the top 10 SNPs
detected = intersect(top10, mySim$QTN.position)
detected
## [1] 2227 1842 526
# Add vertical lines at the positions of the detected QTNs (using blue color)
abline(v = detected, lty = 2, lwd = 2, col = "blue")
# Identify the false positives in the top 10 (SNPs in top 10 that are not true QTNs)
falsePositive = setdiff(top10, mySim$QTN.position)
falsePositive
## [1] 97 977 98 1041 244 1089 1765
abline(v = falsePositive, lty = 2, lwd = 2, col = "red")
order() gives you the indices that would sort the vector, whereas
sort() directly gives you the sorted vector.
True positives, false positives, and false negatives.
# Set the random seed for reproducibility, so the results are the same each time
set.seed(123456)
# Create a logical index based on the chromosome number
index1to5 = myGM[,2] < 6
# This is used to select first half genome (chr 1 to 5) from X
X1to5 = X[,index1to5]
# Simulate phenotype using the G2P function:
mySim = G2P(X = X1to5, h2 = .75, alpha = 1, NQTN = 10, distribution = "norm")
# Display the structure
str(mySim)
## List of 7
## $ addeffect : num [1:10] 1 1 1 1 1 1 1 1 1 1
## $ y : num [1:281, 1] 8.04 7.64 11.13 7.46 7.47 ...
## $ add : num [1:281, 1] 7 6 8 6 8 10 10 8 10 10 ...
## $ residual : num [1:281] 1.043 1.64 3.127 1.46 -0.533 ...
## $ QTN.position: int [1:10] 1084 234 1066 1905 1150 711 237 182 1252 742
## $ SNPQ : int [1:281, 1:10] 0 0 0 1 0 0 2 2 0 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:10] "PZB00228.6" "PZA02963.5" "PZA02888.3" "PZB02424.1" ...
## $ int : num [1:281, 1] 0 0 0 0 0 0 0 0 0 0 ...
# Plot all the markers on the chromosomes
plot(myGM[,c(2,3)])
# Overlay a red scatter plot at the positions of the QTNs
lines(myGM[mySim$QTN.position, c(2, 3)], type = "p", col = "red")
# Highlight the QTN positions
points(myGM[mySim$QTN.position, c(2, 3)], type = "p", col = "blue", cex = 5)
# GWAS function: using correlation to calculate p-values
GWASbyCor = function(X, y) {
# Calculate the number of individuals in the dataset
n = nrow(X)
# Calculate the correlation between phenotype (y) and each SNP
r = cor(y, X)
# Calculate the t-statistic for each SNP using the correlation r
t = r / sqrt((1 - r^2) / (n - 2))
# Calculate the p-value for each SNP using the t-distribution
p = 2 * (1 - pt(abs(t), n - 2)) # The pt function is used here to calculate the cumulative probability of obtaining a t-statistic at or below abs(t) with n - 2 degrees of freedom.
# Replace any zero p-values with a very small value (1e-10)
zeros = p == 0
p[zeros] = 1e-10
# Return the calculated p-values
return(p)
}