# A function that will filter a genotype matrix based on maf and missingness # Calculates the proportion of missing data for every marker in a genotype matrix (mising data is NA) calc_missrate <- function(gt_mat) { col_func <- function(gt_col) { missrate <- sum(is.na(gt_col)) / length(gt_col) return(missrate) } missrate_vect <- apply(gt_mat, 2, col_func) return(missrate_vect) } # Calculates the minor allele frequency for every marker in a genotype matrix (coded as c(-1,0,1)) calc_maf_apply <- function(gt_mat, encoding = c(-1, 0, 1)) { col_func1 <- function(gt_col) { allele1_ct <- (sum(gt_col == -1, na.rm = T) * 2) + sum(gt_col == 0, na.rm = T) allele2_ct <- (sum(gt_col == 1, na.rm = T) * 2) + sum(gt_col == 0, na.rm = T) maf <- min(c(allele1_ct, allele2_ct)) / (sum(!is.na(gt_col))*2) } col_func2 <- function(gt_col) { allele1_ct <- (sum(gt_col == 0, na.rm = T) * 2) + sum(gt_col == 1, na.rm = T) allele2_ct <- (sum(gt_col == 2, na.rm = T) * 2) + sum(gt_col == 1, na.rm = T) maf <- min(c(allele1_ct, allele2_ct)) / (sum(!is.na(gt_col))*2) } if (all(encoding == c(-1, 0, 1))) { maf_vect <- apply(gt_mat, 2, col_func1) } else if (all(encoding == c(0, 1, 2))) { maf_vect <- apply(gt_mat, 2, colfunc2) } else { print('Encoding not recognized, returning NULL') maf_vect <- NULL } return(maf_vect) } # This is a function that will split data into a list of k-folds make_CV_sets <- function(response, predictors, k = 5) { rand_values <- rnorm(length(response)) k_quantiles <- quantile(rand_values, 0:k/k) k_assign <- cut(rand_values, k_quantiles, include.lowest = T, labels = F) cv_list <- list() for (i in 1:k) { train_x <- data.frame(predictors[k_assign != i,]) train_y <- response[k_assign != i] test_x <- data.frame(predictors[k_assign == i,]) test_y <- response[k_assign == i] fold <- list(train_x, train_y, test_x, test_y) names(fold) <- c("train_x", "train_y", "test_x", "test_y") cv_list[[i]] <- fold } return(cv_list) } # This function takes in a set of folds from make_CV_sets(), builds a GLM model using the training set, and calculates model accuracy (r^2) using a testing set lm_jank <- function(cv_sets) { validation_results <- sapply(cv_sets, function(fold) { train_data <- cbind(fold$train_y, fold$train_x) names(train_data)[1] <- "y" # Calculate the model train_model <- lm(y ~ ., data = train_data) # Use the model to predict the test set test_predict <- predict(train_model, fold$test_x) # Correlate test predicted vs observed and square to get r-squared accuracy <- cor(fold$test_y, test_predict)^2 return(accuracy) }) }