source("http://zzlab.net/GAPIT/gapit_functions.txt")
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
#myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)
pcs <- prcomp(myGD[,-1])
set.seed(99164)
mySim=GAPIT.Phenotype.Simulation(
GD=myGD,
GM=myGM,
h2=.7,
NQTN=20,
QTNDist="normal"
)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
# Generate folds for cross-validation
my_data <- data.frame(y = mySim$Y[,2], pcs$x[,1:3])
flds <- createFolds(my_data[,1], k=3, list = TRUE, returnTrain = FALSE)
# Perform 3-fold cross-validation with visualizations
for (i in 1:3) {
folds <- flds
test_idx <- folds[[i]]
folds[[i]] <- NULL
train_idx <- c(folds[[1]], folds[[2]])
trainData <- my_data[train_idx, ]
testData <- my_data[test_idx, ]
# Train model
lmModel <- train(y ~ PC1 + PC2 + PC3, data=trainData, method = "lm")
# Make predictions
preY <- predict(lmModel, testData)
# Compute accuracy (R²)
r2 <- cor(preY, testData$y)^2
print(paste("Fold", i, "R²:", round(r2, 3)))
# Visualization 1: Data split (Train vs. Test)
plot_data <- my_data
plot_data$Set <- "Train"
plot_data$Set[test_idx] <- "Test"
plot1 <- ggplot(plot_data, aes(x = PC1, y = y, color = Set)) +
geom_point(size = 3) +
scale_color_manual(values = c("Train" = "blue", "Test" = "red")) +
labs(title = paste("Fold", i, "- Train vs. Test Split"),
subtitle = "Red: Test Data, Blue: Train Data") +
theme_minimal()
print(plot1) # Explicitly print the plot
# Visualization 2: Predictions vs. Actual Values
results <- data.frame(Actual = testData$y, Predicted = preY)
plot2 <- ggplot(results, aes(x = Actual, y = Predicted)) +
geom_point(color = "darkgreen", size = 3) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
labs(title = paste("Fold", i, "- Predictions vs. Actual"),
subtitle = paste("R²:", round(r2, 3))) +
theme_minimal()
print(plot2) # Explicitly print the plot
}
## [1] "Fold 1 R²: 0.089"
## [1] "Fold 2 R²: 0"
## [1] "Fold 3 R²: 0.044"
### Data and simulation
source("http://zzlab.net/GAPIT/gapit_functions.txt")
#Import demo data
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
myCV=read.table(file="http://zzlab.net/GAPIT/data/mdp_env.txt",head=T)
X=myGD[,-1]
taxa=myGD[,1]
index1to5=myGM[,2]<6
X1to5 = X[,index1to5]
GD.candidate=cbind(as.data.frame(taxa),X1to5)
# simulation of phenotype
set.seed(99164)
mySim=GAPIT.Phenotype.Simulation(GD=GD.candidate,GM=myGM[index1to5,],h2=.5,NQTN=20, effectunit =.95,QTNDist="normal",CV=myCV,cveff=c(.2,.2),a2=.5,adim=3,category=1,r=.4)
myY=mySim$Y
colnames(myY)=c("Taxa","Trait1")
# Training set and testing set
nfold=5
# Another way to sample data
sets=sample(cut(1:nrow(myY),nfold,labels=FALSE),nrow(myY))
training=myY[,c(1,2)]
test_index=sets==1
training[test_index,2]=NA
testing=cbind(myY[test_index,c(1,2)], mySim$u[test_index])
myGAPIT5 <- GAPIT(
Y=training,
GD=myGD,
GM=myGM,
PCA.total=3,
CV=myCV,
model="gBLUP",
file.output=F
)
# Install the BLR package if not already installed
# install.packages("BLR")
# Load the BLR package for Bayesian LASSO regression
library(BLR)
# Define the number of iterations for the Bayesian algorithm
nIter = 10 # Total number of iterations for Markov Chain Monte Carlo (MCMC)
# Define the burn-in period
burnIn = 1500 # The number of initial iterations to discard to allow the algorithm to stabilize
# Extract Principal Component Analysis (PCA) results from GAPIT output
PCA = myGAPIT5$PCA # PCA components from GAPIT5 output
# Remove the first column (which usually contains sample names or IDs)
pcEnv = PCA[, -1] # Keep only the numerical PCA values
# Apply Bayesian LASSO Regression (BLR) model
myBLR = BLR(
y = as.matrix(myY[!test_index, 2]), # Response variable (phenotype), excluding test samples
XF = pcEnv[!test_index, -1], # Fixed effects (covariates) excluding test samples
XL = as.matrix(myGD[!test_index, -1]), # Marker data for training set
nIter = nIter, # Number of MCMC iterations
burnIn = burnIn # Burn-in period
)
## ======== Bayesian Regression Coupled with LASSO ========
## # #
## # BLR v1.6 #
## # January, 2020 #
## # Contact: #
## # Gustavo de los Campos, gdeloscampos@gmail.com #
## # Paulino Perez-Rodriguez,perpdgo@colpos.mx #
## # #
## =========================================================
## ===============================================================
## No prior was provided, BLR is running with improper priors.
## ===============================================================
## Iter: 1 time/iter: 0.024 varE: 23.682 lambda: 59.904
## ------------------------------------------------------------
## Iter: 2 time/iter: 0.003 varE: 22.075 lambda: 60.613
## ------------------------------------------------------------
## Iter: 3 time/iter: 0.008 varE: 22.62 lambda: 60.103
## ------------------------------------------------------------
## Iter: 4 time/iter: 0.003 varE: 24.552 lambda: 59.075
## ------------------------------------------------------------
## Iter: 5 time/iter: 0.002 varE: 25.503 lambda: 59.673
## ------------------------------------------------------------
## Iter: 6 time/iter: 0.002 varE: 25.436 lambda: 59.293
## ------------------------------------------------------------
## Iter: 7 time/iter: 0.007 varE: 25.656 lambda: 59.501
## ------------------------------------------------------------
## Iter: 8 time/iter: 0.002 varE: 26.535 lambda: 59.133
## ------------------------------------------------------------
## Iter: 9 time/iter: 0.003 varE: 26.118 lambda: 59.765
## ------------------------------------------------------------
## Iter: 10 time/iter: 0.003 varE: 25.611 lambda: 59.978
## ------------------------------------------------------------
# --- Model Performance on Training Set ---
# Predict phenotype values for training data using estimated marker effects
pred.ref = as.matrix(myGD[!test_index, -1]) %*% myBLR$bL
# Compute correlation between observed and predicted values in the training set
modelfit <- cor(myY[!test_index, 2], pred.ref) # This gives R² (model fit)
print(modelfit)
## [,1]
## [1,] NA
# --- Model Performance on Testing Set ---
# Predict phenotype values for test data using estimated marker effects
pred.inf = as.matrix(myGD[test_index, -1]) %*% myBLR$bL
# Compute accuracy (correlation between observed and predicted values in the test set)
accuracy <- cor(myY[test_index, 2], pred.inf) # This gives R² for test set
print(accuracy)
## [,1]
## [1,] NA
# Install the BGLR package if not already installed
# install.packages("BGLR")
# Load the BGLR package for Bayesian Genomic Regression
library(BGLR)
# Define the number of iterations for the Bayesian algorithm
nIter = 10 # Total number of iterations for Markov Chain Monte Carlo (MCMC)
# Define the burn-in period (initial iterations discarded)
burnIn = 1500 # The number of initial iterations to allow the algorithm to stabilize
# Run the BGLR model for genomic prediction
myBGLR = BGLR(
y = as.matrix(myY[!test_index, 2]), # Response variable (phenotype) for training data
ETA = list(
list(
X = pcEnv[!test_index, -1], # Fixed effects (e.g., environmental variables or principal components)
model = 'FIXED', # Use a fixed effect model for these variables
saveEffects = TRUE # Save the estimated effects for later use
),
list(
X = as.matrix(myGD[!test_index, -1]), # Genotypic marker data for training set
model = 'BRR', # Bayesian Ridge Regression (BRR) model for marker effects
saveEffects = TRUE # Save the estimated effects
)
# Uncomment the following lines to experiment with different Bayesian models:
# list(X = as.matrix(myGD[!test_index, -1]), model = 'BL', saveEffects = TRUE), # Bayesian Lasso (BL)
# list(X = as.matrix(myGD[!test_index, -1]), model = 'BayesA', saveEffects = TRUE), # BayesA
# list(X = as.matrix(myGD[!test_index, -1]), model = 'BayesB', saveEffects = TRUE) # BayesB
),
nIter = nIter, # Number of MCMC iterations
burnIn = burnIn # Burn-in period to remove early unstable iterations
)
# --- Model Performance on Training Set ---
# Predict phenotype values for the training set using estimated marker effects
pred.ref = as.matrix(myGD[!test_index, -1]) %*% myBGLR$ETA[[2]]$b
# Compute correlation between observed and predicted values in the training set (R² for model fit)
modelfit <- cor(myY[!test_index, 2], pred.ref)
print(modelfit)
# --- Model Performance on Testing Set ---
# Predict phenotype values for the test set using estimated marker effects
pred.inf = as.matrix(myGD[test_index, -1]) %*% myBGLR$ETA[[2]]$b
# Compute correlation between observed and predicted values in the test set (R² for model accuracy)
accuracy <- cor(myY[test_index, 2], pred.inf)
print(accuracy)