## Simulation with GAPIT

source("http://zzlab.net/GAPIT/gapit_functions.txt") 
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) 
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
#myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)
pcs <- prcomp(myGD[,-1])
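
Before using the PCs as predictors, it is worth checking how much of the genotypic variance the leading components capture; a quick look using the pcs object above:

# Proportion of variance explained by each principal component
pve <- pcs$sdev^2 / sum(pcs$sdev^2)
round(pve[1:5], 3)
# Cumulative share captured by the three PCs used below
sum(pve[1:3])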

set.seed(99164) 
mySim=GAPIT.Phenotype.Simulation(
  GD=myGD, 
  GM=myGM,
  h2=.7,
  NQTN=20, 
  QTNDist="normal"
  )
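
The returned list includes the simulated phenotypes (mySim$Y) and the true genetic values (mySim$u), which the later sections use as a gold standard. A quick sanity check: the squared correlation between phenotype and genetic value should be close to the target heritability of 0.7.

# Realized heritability check: cor(y, u)^2 should approximate h2 = 0.7
cor(mySim$Y[,2], mySim$u)^2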

### Cross-validation

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)

# Generate folds for cross-validation
my_data <- data.frame(y = mySim$Y[,2], pcs$x[,1:3])
flds <- createFolds(my_data[,1], k=3, list = TRUE, returnTrain = FALSE)
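
createFolds returns a list of test-set indices, one element per fold; a quick check that the folds are roughly balanced:

# Each fold should hold about a third of the observations
sapply(flds, length)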

# Perform 3-fold cross-validation with visualizations
for (i in 1:3) {
  test_idx  <- flds[[i]]          # indices of the held-out fold
  train_idx <- unlist(flds[-i])   # indices of the remaining folds

  trainData <- my_data[train_idx, ]
  testData  <- my_data[test_idx, ]

  # Train model
  lmModel <- train(y ~ PC1 + PC2 + PC3, data=trainData, method = "lm")

  # Make predictions
  preY <- predict(lmModel, testData)

  # Compute accuracy (R²)
  r2 <- cor(preY, testData$y)^2
  print(paste("Fold", i, "R²:", round(r2, 3)))

  # Visualization 1: Data split (Train vs. Test)
  plot_data <- my_data
  plot_data$Set <- "Train"
  plot_data$Set[test_idx] <- "Test"

  plot1 <- ggplot(plot_data, aes(x = PC1, y = y, color = Set)) +
    geom_point(size = 3) +
    scale_color_manual(values = c("Train" = "blue", "Test" = "red")) +
    labs(title = paste("Fold", i, "- Train vs. Test Split"),
         subtitle = "Red: Test Data, Blue: Train Data") +
    theme_minimal()
  
  print(plot1)  # Explicitly print the plot

  # Visualization 2: Predictions vs. Actual Values
  results <- data.frame(Actual = testData$y, Predicted = preY)
  
  plot2 <- ggplot(results, aes(x = Actual, y = Predicted)) +
    geom_point(color = "darkgreen", size = 3) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
    labs(title = paste("Fold", i, "- Predictions vs. Actual"),
         subtitle = paste("R²:", round(r2, 3))) +
    theme_minimal()

  print(plot2)  # Explicitly print the plot
}
## [1] "Fold 1 R²: 0.089"

## [1] "Fold 2 R²: 0"

## [1] "Fold 3 R²: 0.044"

### Data and simulation

source("http://zzlab.net/GAPIT/gapit_functions.txt")

#Import demo data
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)

# Restrict QTN candidates to markers on chromosomes 1-5
X=myGD[,-1]
taxa=myGD[,1]
index1to5=myGM[,2]<6   # column 2 of myGM is the chromosome
X1to5 = X[,index1to5]
GD.candidate=cbind(as.data.frame(taxa),X1to5)

# Simulation of phenotype
set.seed(99164)
mySim=GAPIT.Phenotype.Simulation(
  GD=GD.candidate,
  GM=myGM[index1to5,],
  h2=.5,
  NQTN=20,
  effectunit=.95,
  QTNDist="normal",
  CV=myCV,
  cveff=c(.2,.2),
  a2=.5,
  adim=3,
  category=1,
  r=.4
  )
myY=mySim$Y
colnames(myY)=c("Taxa","Trait1")

# Training set and testing set
nfold=5
# Assign each individual to one of nfold folds at random
sets=sample(cut(1:nrow(myY),nfold,labels=FALSE),nrow(myY))
training=myY[,c(1,2)]
test_index=sets==1
# Mask the phenotypes of fold 1 so gBLUP treats them as unobserved
training[test_index,2]=NA
# Keep the observed phenotypes and true genetic values for evaluation
testing=cbind(myY[test_index,c(1,2)], mySim$u[test_index])
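
A quick check that the masking worked as intended:

# About 1/nfold of the phenotypes should now be NA in the training set
table(is.na(training[,2]))
dim(testing)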

### gBLUP through GAPIT

myGAPIT5 <- GAPIT(
  Y=training,
  GD=myGD,
  GM=myGM,
  PCA.total=3,
  CV=myCV,
  model="gBLUP",
  file.output=F
  )
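
GAPIT returns the genomic predictions in myGAPIT5$Pred. A sketch of scoring accuracy against the true genetic values kept in the testing set, assuming the Pred data frame carries Taxa and Prediction columns (column names vary slightly across GAPIT versions, so adjust as needed):

# Match predictions to the testing taxa and correlate with the true genetic values
pred <- myGAPIT5$Pred
inf  <- pred[match(testing[,1], pred$Taxa), ]   # "Prediction" column name is an assumption
cor(inf$Prediction, testing[,3])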

### BLR

# Install the BLR package if not already installed
# install.packages("BLR")

# Load the BLR package for Bayesian LASSO regression
library(BLR)

# Define the number of iterations for the Bayesian algorithm
nIter = 10  # Total number of Markov chain Monte Carlo (MCMC) iterations

# Define the burn-in period
burnIn = 1500  # Initial iterations discarded so the chain can stabilize
# NOTE: these demo values are deliberately tiny, and burnIn exceeds nIter,
# so no samples are kept and the estimates below come out NA. A real
# analysis needs nIter well above burnIn (e.g., nIter = 12000, burnIn = 2000).

# Extract the principal components computed by GAPIT during the gBLUP run
PCA = myGAPIT5$PCA

# Drop the first column (taxa names), keeping only the numerical PC scores
pcEnv = PCA[, -1]

# Apply the Bayesian LASSO regression (BLR) model
myBLR = BLR(
  y = as.matrix(myY[!test_index, 2]),     # Response variable (phenotype), training samples only
  XF = pcEnv[!test_index, ],              # Fixed effects (the three PCs); pcEnv already dropped the taxa column
  XL = as.matrix(myGD[!test_index, -1]),  # Marker data for the training set
  nIter = nIter,    # Number of MCMC iterations
  burnIn = burnIn   # Burn-in period
)
## ========  Bayesian Regression Coupled with LASSO ========
## #                                                       #
## #                    BLR v1.6                           #
## #                   January, 2020                       #
## #      Contact:                                         #
## #      Gustavo de los Campos, gdeloscampos@gmail.com    #
## #    Paulino Perez-Rodriguez,perpdgo@colpos.mx        #
## #                                                       #
## =========================================================
## ===============================================================
## No prior was provided, BLR is running with improper priors.
## ===============================================================
## Iter:  1 time/iter:  0.024 varE:  23.682 lambda:  59.904
## ------------------------------------------------------------
## Iter:  2 time/iter:  0.003 varE:  22.075 lambda:  60.613
## ------------------------------------------------------------
## Iter:  3 time/iter:  0.008 varE:  22.62 lambda:  60.103
## ------------------------------------------------------------
## Iter:  4 time/iter:  0.003 varE:  24.552 lambda:  59.075
## ------------------------------------------------------------
## Iter:  5 time/iter:  0.002 varE:  25.503 lambda:  59.673
## ------------------------------------------------------------
## Iter:  6 time/iter:  0.002 varE:  25.436 lambda:  59.293
## ------------------------------------------------------------
## Iter:  7 time/iter:  0.007 varE:  25.656 lambda:  59.501
## ------------------------------------------------------------
## Iter:  8 time/iter:  0.002 varE:  26.535 lambda:  59.133
## ------------------------------------------------------------
## Iter:  9 time/iter:  0.003 varE:  26.118 lambda:  59.765
## ------------------------------------------------------------
## Iter:  10 time/iter:  0.003 varE:  25.611 lambda:  59.978
## ------------------------------------------------------------
# --- Model Performance on Training Set ---
# Predict phenotype values for training data using estimated marker effects
pred.ref = as.matrix(myGD[!test_index, -1]) %*% myBLR$bL

# Correlation between observed and predicted values in the training set (model fit)
modelfit <- cor(myY[!test_index, 2], pred.ref)
print(modelfit)
##      [,1]
## [1,]   NA
# --- Model Performance on Testing Set ---
# Predict phenotype values for test data using estimated marker effects
pred.inf = as.matrix(myGD[test_index, -1]) %*% myBLR$bL

# Correlation between observed and predicted values in the test set (prediction accuracy)
accuracy <- cor(myY[test_index, 2], pred.inf)
print(accuracy)
##      [,1]
## [1,]   NA
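
The NA values above are the symptom of the demo settings noted earlier: with burnIn larger than nIter, BLR keeps no post-burn-in samples, so the marker effects in myBLR$bL are undefined. A sketch of a re-run with workable (still short) settings:

# Re-run with the burn-in shorter than the chain length
myBLR = BLR(
  y = as.matrix(myY[!test_index, 2]),
  XF = pcEnv[!test_index, ],
  XL = as.matrix(myGD[!test_index, -1]),
  nIter = 2000, burnIn = 500
)
pred.inf = as.matrix(myGD[test_index, -1]) %*% myBLR$bL
cor(myY[test_index, 2], pred.inf)  # accuracy is now a finite number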

### BGLR

# Install the BGLR package if not already installed
# install.packages("BGLR")

# Load the BGLR package for Bayesian Genomic Regression
library(BGLR)

# Define the number of iterations for the Bayesian algorithm
nIter = 10  # Total number of MCMC iterations (demo value; see the BLR note above)

# Define the burn-in period (initial iterations discarded)
burnIn = 1500  # As with BLR, burnIn > nIter leaves no kept samples (NA estimates)

# Run the BGLR model for genomic prediction
myBGLR = BGLR(
  y = as.matrix(myY[!test_index, 2]),  # Response variable (phenotype) for training data
  ETA = list(
    list(
      X = pcEnv[!test_index, ],     # Fixed effects (the three PCs); pcEnv already dropped the taxa column
      model = 'FIXED',              # Use a fixed effect model for these variables
      saveEffects = TRUE            # Save the estimated effects for later use
    ),
    list(
      X = as.matrix(myGD[!test_index, -1]),  # Genotypic marker data for training set
      model = 'BRR',                         # Bayesian Ridge Regression (BRR) model for marker effects
      saveEffects = TRUE                     # Save the estimated effects
    )
    # Uncomment the following lines to experiment with different Bayesian models:
    # list(X = as.matrix(myGD[!test_index, -1]), model = 'BL', saveEffects = TRUE),   # Bayesian Lasso (BL)
    # list(X = as.matrix(myGD[!test_index, -1]), model = 'BayesA', saveEffects = TRUE), # BayesA
    # list(X = as.matrix(myGD[!test_index, -1]), model = 'BayesB', saveEffects = TRUE)  # BayesB
  ),
  nIter = nIter,    # Number of MCMC iterations
  burnIn = burnIn   # Burn-in period to remove early unstable iterations
)

# --- Model Performance on Training Set ---
# Predict phenotype values for the training set using estimated marker effects
pred.ref = as.matrix(myGD[!test_index, -1]) %*% myBGLR$ETA[[2]]$b

# Correlation between observed and predicted values in the training set (model fit)
modelfit <- cor(myY[!test_index, 2], pred.ref)
print(modelfit)

# --- Model Performance on Testing Set ---
# Predict phenotype values for the test set using estimated marker effects
pred.inf = as.matrix(myGD[test_index, -1]) %*% myBGLR$ETA[[2]]$b

# Correlation between observed and predicted values in the test set (prediction accuracy)
accuracy <- cor(myY[test_index, 2], pred.inf)
print(accuracy)
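
The predictions above use the marker effects only. BGLR also returns the intercept (myBGLR$mu) and the fixed-effect coefficients (myBGLR$ETA[[1]]$b), so a fuller prediction adds those terms; a minimal sketch, assuming workable nIter/burnIn settings as in the BLR note:

# Full prediction: intercept + fixed effects (PCs) + marker effects
pred.full = myBGLR$mu +
  as.matrix(pcEnv[test_index, ]) %*% myBGLR$ETA[[1]]$b +
  as.matrix(myGD[test_index, -1]) %*% myBGLR$ETA[[2]]$b
cor(myY[test_index, 2], pred.full)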