###ALPACA package### #Authors: Haixiao Dong and Ryan Oliveira #Date:March, 2017 ###ALPACA function ##input: y, X, C, PCA.u #n is number of individuals, m is number of markers, t is nuber of covariates #y -- phenotype, n by 1 dimension #X -- genotype, n by m dimension #C -- covariate, n by t dimension, default: NULL #myGM -- SNP positions #PCA.M -- Major PCAs used as co-factors, default: c(1,2,3) #QTN.position -- simulated QTN positions, default: NULL #cutoff -- choose p cutoff (e.g. 10^-6), default: alpha = 0.05 with Bonferroni correction ###output a list contain: #P.value -- probality values, 1 by m dimesion #cutoff.final -- cutoff value finally used #sig.SNP -- significant SNPs under final cutoff #sig.SNP.P -- P values corresponding to the sig.SNP #order.SNP -- All SNPs based on the ascending order of P.values #power.fdr.type1error -- Power, fdr, type I error corresponding to order.SNP ALPACA <- function(y=NULL, X=NULL, C=NULL, GM=NULL, PCA.M=1:3, QTN.position=NULL, cutoff=NULL){ print("Welcome to using ALPACA!") ###check and copy input data GD=X CV=C PCA=prcomp(GD) ##automatically eliminate own co-factors if linear dependent to the covariates provided by users fD=NULL if (!is.null(CV)){ fD <- fixDependence(as.matrix(CV), PCA$x[, PCA.M]) } if (length(fD)>0){ PCA.M = c() print("Linear dependence exists, own co-factors (PCA) eliminated!") } else { write.csv(x=PCA$x[, PCA.M], file="ALPACA.PCA.csv") pdf(file="ALPACA.pdf", onefile=TRUE) plot(PCA$x[, 1], PCA$x[, 2], xlab="PCA1", ylab="PCA2", col="red") plot(PCA$x[, 1], PCA$x[, 3], xlab="PCA1", ylab="PCA3", col="red") plot(PCA$x[, 2], PCA$x[, 3], xlab="PCA2", ylab="PCA3", col="red") print("PCA[, 1:3] used as default, plots and table generated!") dev.off() } #P values calculation, from zhiwu's slides n=nrow(GD) m=ncol(GD) P=matrix(NA,1,m) for (i in 1:m){ x=GD[, i] if(max(x)==min(x)){ p=1}else{ X=as.matrix(cbind(1, CV, PCA$x[,PCA.M], x)) LHS=t(X)%*%X C=solve(LHS) RHS=t(X)%*%y b=C%*%RHS yb=X%*%b e=y-yb n=length(y) ve=sum(e^2)/(n-1) vt=C*ve t=b/sqrt(diag(vt)) p=2*(1-pt(abs(t),n-2)) } #end of testing variation P[i]=p[length(p)] } #end of looping for markers P.value=P order.SNP=order(P.value) #If cutoff is default, uses Bonferroni; else uses -log(value) cutoff.final=(ifelse( is.null(cutoff), 0.05/m, cutoff )) sig.SNP <- order.SNP[sort(P.value)<=cutoff.final] ###power.fdr.type1error calculation power.fdr.type1error=NULL if (!is.null(QTN.position)){ power.fdr.type1error=power.fdr(order.SNP, QTN.position) } ### zeros=P==0 P[zeros]=1e-20 ###Generate Manhattan plot #color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10) color.vector <- rep(rainbow(5), length(unique(GM[,2]))) pdf(file="GLM.Manhattan.pdf") plot(t(-log10(P))~seq(1:m),col=color.vector[GM[,2]], xlab="", ylab="-log10(p)") if (!is.null(QTN.position)){ points(QTN.position, -log10(P[QTN.position]), type="p", pch=20, cex=1,lwd=1.5,col="dimgrey") points(QTN.position, -log10(P[QTN.position]), type="p", pch=21, cex=2,lwd=1.5,col="dimgrey") } abline(h=-log10(cutoff.final), col="green") print("Manhattan plot generated!") dev.off() ###Generate QQ plot p.obs=P m2=length(p.obs) p.uni=runif(m2,0,1) order.obs=order(p.obs) order.uni=order(p.uni) pdf(file="GLM.QQ.pdf") plot(-log10(p.uni[order.uni]), -log10(p.obs[order.obs]), xlab="Expected -log10(p)", ylab="Observed -log10(p)") abline(a=0, b=1, col="red") print("QQ plot generated!") dev.off() ###Writes SNP names and p-values as a two-column, unlabeled, space-delimited text file myGLM.results = matrix((c(row.names(t(myGD))[-1],myGM[,2],myGM[,3],P.value,rank(P.value))),nrow=3093,ncol=5) colnames(myGLM.results)=c("SNP ","Chromosome ","Position ","P value ","Order ") write.csv(myGLM.results,file="myGLM.results.csv",row.names=FALSE) print("ALPACA successfully finished!") print("Use str(myGLM) to check the results, and a list of p-values by SNP is available at myGLM.results.csv") return(list(P.value=P.value, cutoff.final=cutoff.final, sig.SNP=sig.SNP, sig.SNP.P=P.value[sig.SNP], order.SNP=order.SNP, power.fdr.type1error=power.fdr.type1error)) } ###Function used to detect power, fdr and type-I-error. ##Input #order.SNP -- SNPs positions in order #QTN.position -- simulated QTN positions ##Output a list contain #power #fdr #type I error power.fdr <- function(order.SNP, QTN.position) { pwr <- c() fdr <- c() t1error <- c() NQTN <- length(QTN.position) nsnp <- length(order.SNP) for (m in (1: nsnp)) { detected <- intersect(order.SNP[1:m], QTN.position) falsePositive <- setdiff(order.SNP[1:m], QTN.position) pwr <- c(pwr, length(detected)/NQTN) fdr <- c(fdr, length(falsePositive)/m) t1error <- c(t1error, length(falsePositive)/nsnp) } return(list(power=pwr, fdr=fdr, type1error=t1error)) }