###ALPACA package###
#Authors: Haixiao Dong and Ryan Oliveira
#Date:March, 2017

###ALPACA function
##input: y, X, C, PCA.u
#n is number of individuals, m is number of markers, t is nuber of covariates
#y -- phenotype, n by 1 dimension
#X -- genotype, n by m dimension
#C -- covariate, n by t dimension, default: NULL
#myGM -- SNP positions
#PCA.M -- Major PCAs used as co-factors, default: c(1,2,3)
#QTN.position -- simulated QTN positions, default: NULL
#cutoff -- choose p cutoff (e.g. 10^-6), default: alpha = 0.05 with Bonferroni correction

###output a list contain: 
#P.value -- probality values, 1 by m dimesion
#cutoff.final -- cutoff value finally used
#sig.SNP -- significant SNPs under final cutoff
#sig.SNP.P -- P values corresponding to the sig.SNP
#order.SNP -- All SNPs based on the ascending order of P.values
#power.fdr.type1error -- Power, fdr, type I error corresponding to order.SNP

ALPACA <- function(y=NULL, X=NULL, C=NULL, GM=NULL, PCA.M=1:3, QTN.position=NULL, cutoff=NULL){
  print("Welcome to using ALPACA!")
  ###check and copy input data
  GD=X
  CV=C
  PCA=prcomp(GD)
  
  ##automatically eliminate own co-factors if linear dependent to the covariates provided by users
  fD=NULL
  if (!is.null(CV)){
    fD <- fixDependence(as.matrix(CV), PCA$x[, PCA.M])
  }
  if (length(fD)>0){
    PCA.M = c()
    print("Linear dependence exists, own co-factors (PCA) eliminated!")
  } else {
    write.csv(x=PCA$x[, PCA.M], file="ALPACA.PCA.csv")
    pdf(file="ALPACA.pdf", onefile=TRUE)
    plot(PCA$x[, 1], PCA$x[, 2], xlab="PCA1", ylab="PCA2", col="red")
    plot(PCA$x[, 1], PCA$x[, 3], xlab="PCA1", ylab="PCA3", col="red")
    plot(PCA$x[, 2], PCA$x[, 3], xlab="PCA2", ylab="PCA3", col="red")
    print("PCA[, 1:3] used as default, plots and table generated!")
    dev.off()
  }

  #P values calculation, from zhiwu's slides
  n=nrow(GD)
  m=ncol(GD)
  P=matrix(NA,1,m)
  for (i in 1:m){
    x=GD[, i]
    if(max(x)==min(x)){
      p=1}else{
        X=as.matrix(cbind(1, CV, PCA$x[,PCA.M], x))
        LHS=t(X)%*%X
        C=solve(LHS)
        RHS=t(X)%*%y
        b=C%*%RHS
        yb=X%*%b
        e=y-yb
        n=length(y)
        ve=sum(e^2)/(n-1)
        vt=C*ve
        t=b/sqrt(diag(vt))
        p=2*(1-pt(abs(t),n-2))
      } #end of testing variation
    P[i]=p[length(p)]
  } #end of looping for markers
  
  P.value=P
  order.SNP=order(P.value)
  
  #If cutoff is default, uses Bonferroni; else uses -log(value)
  cutoff.final=(ifelse(
    is.null(cutoff),
    0.05/m,
    cutoff
  ))
  sig.SNP <- order.SNP[sort(P.value)<=cutoff.final]
  
  ###power.fdr.type1error calculation
  power.fdr.type1error=NULL
  if (!is.null(QTN.position)){
  power.fdr.type1error=power.fdr(order.SNP, QTN.position)
  }
  ###
  zeros=P==0
  P[zeros]=1e-20

  ###Generate Manhattan plot
  #color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
  color.vector <- rep(rainbow(5), length(unique(GM[,2])))
  pdf(file="GLM.Manhattan.pdf")
  plot(t(-log10(P))~seq(1:m),col=color.vector[GM[,2]], 
       xlab="", ylab="-log10(p)")
  if (!is.null(QTN.position)){
    points(QTN.position, -log10(P[QTN.position]), type="p", pch=20, cex=1,lwd=1.5,col="dimgrey")
    points(QTN.position, -log10(P[QTN.position]), type="p", pch=21, cex=2,lwd=1.5,col="dimgrey")
  }
  abline(h=-log10(cutoff.final), col="green")
  print("Manhattan plot generated!")
  dev.off()
  
  ###Generate QQ plot
  p.obs=P
  m2=length(p.obs)
  p.uni=runif(m2,0,1)
  order.obs=order(p.obs)
  order.uni=order(p.uni)

  pdf(file="GLM.QQ.pdf")
  plot(-log10(p.uni[order.uni]), -log10(p.obs[order.obs]), 
       xlab="Expected -log10(p)", ylab="Observed -log10(p)")
  abline(a=0, b=1, col="red")
  print("QQ plot generated!")
  dev.off()
  
  ###Writes SNP names and p-values as a two-column, unlabeled, space-delimited text file
  myGLM.results = matrix((c(row.names(t(myGD))[-1],myGM[,2],myGM[,3],P.value,rank(P.value))),nrow=3093,ncol=5)
  colnames(myGLM.results)=c("SNP ","Chromosome ","Position ","P value ","Order ")
  write.csv(myGLM.results,file="myGLM.results.csv",row.names=FALSE)
  print("ALPACA successfully finished!")
  print("Use str(myGLM) to check the results, and a list of p-values by SNP is available at myGLM.results.csv")
  return(list(P.value=P.value, cutoff.final=cutoff.final, sig.SNP=sig.SNP, sig.SNP.P=P.value[sig.SNP], order.SNP=order.SNP, power.fdr.type1error=power.fdr.type1error))
}

###Function used to detect power, fdr and type-I-error.
##Input
#order.SNP -- SNPs positions in order
#QTN.position -- simulated QTN positions

##Output a list contain
#power
#fdr
#type I error
power.fdr <- function(order.SNP, QTN.position) {
  pwr <- c()
  fdr <- c()
  t1error <- c()
  NQTN <- length(QTN.position)
  nsnp <- length(order.SNP)
  for (m in (1: nsnp)) {
    detected <- intersect(order.SNP[1:m], QTN.position)
    falsePositive <- setdiff(order.SNP[1:m], QTN.position)
    pwr <- c(pwr, length(detected)/NQTN)
    fdr <- c(fdr, length(falsePositive)/m)
    t1error <- c(t1error, length(falsePositive)/nsnp)
  }
  return(list(power=pwr, fdr=fdr, type1error=t1error))
}