############################################################################################## GAPIT <- function(Y=NULL,G=NULL,GD=NULL,GM=NULL,KI=NULL,Z=NULL,CV=NULL,emmaXp3d=TRUE, groupFrom=1 ,groupTo=1000000,groupBy=10,CA="average", KT='Mean',kinMethod="VanRaden", ngrid = 100, llin = -10, ulim = 10, esp = 1e-10, dataPath=NULL,numFiles=1,GFile=NULL, GFileExt=NULL,GDFile=NULL, GMFile=NULL, GDFileExt=NULL,GMFileExt=NULL, mafRate=0.05,FDR.Rate = 0.05,FDR.Filter.Rate=1,model="Add",numPCs=0){ GAPIT.Version="1.25" #Object: To perform GWAS and GPS (Genomic Prediction/Selection) #Output: See GAPIT.Main #Authors: Zhiwu Zhang #Last update: August 10, 2011 ############################################################################################## #Input defination #Y #This is phenotype data #Genotype data can appear as either hapmap or numerical, but not both #G #This is genotype data in format of hapmap #GD #They pair of genotype (data and map)with genotype data in numerical format #GM #Genotype can be loaded as above genotype data. The other option is by file names #numFiles #Number of Genotype files #dataPath #The location of genotype files #GFile #Common file name (before the numerical part) for hapmap format #GFileExt #Common file extention (after the numerical part) #GDFile #Common file name (before the numerical part) for genotype data #GMFile= #Common file name (before the numerical part) for genotype map #GDFileExt #Common file extention (after the numerical part) for genotype data #GMFileExt #Common file extention (after the numerical part)for genotype map #KI #This is kinship data, set it to NULL in case that geneotype files are used for estimation #CV #This is the covariate variables of fixed effects, such as population structure #Z #This is the customized Z matrix #groupFrom #Lower bound for number of group #groupTo #Upper bound for number of group #groupBy #rang between 1 and number of individuals, smaller the finner optimization #CA #clustering method: "average", "complete", "ward", "single", "mcquitty", "median","centroid". Example: CA=c("complete","average") #KT #Group kinship tppe: "Mean", "Max", "Min", "Median". Example: KT=c("Mean","Max") #emmaXp3d #This is the option to use P3D (TRUE) or not (FALSE) #model #Genetic model for SNP effect: "Add" or "Dom" #mafRate #The SNP below this rate will be removed from reports #FDR.Rate #Alex will add explainatino #kinMethod #The method to estimate kinship from genotype files. Options are "EMMA", "Loiselle" #numPCs= #Numer of PCS derived from genotype as population structure #--------------------------------------------------------------------------------------------------------------------< if(!is.null(Y)){ if (ncol(Y)<2) stop ("Phenotype should have taxa name and one trait at least. Please correct phenotype file!") for (trait in 2: ncol(Y)) { gapitMain <- GAPIT.Main(Y=Y[,c(1,trait)],G=G,GD=GD,GM=GM,KI=KI,Z=Z,CV=CV,emmaXp3d=emmaXp3d,kinMethod=kinMethod, groupFrom=groupFrom,groupTo=groupTo,groupBy=groupBy,CA=CA,KT=KT,name.of.trait = colnames(Y)[trait], dataPath=dataPath,numFiles=numFiles,GFile=GFile,GFileExt=GFileExt,GDFile=GDFile, GMFile=GMFile, GDFileExt=GDFileExt,GMFileExt=GMFileExt, mafRate = mafRate,FDR.Rate = FDR.Rate,FDR.Filter.Rate=FDR.Filter.Rate,model=model,numPCs=numPCs,GAPIT.Version=GAPIT.Version, GT=NULL) return (list(KI=gapitMain$KI,PC=gapitMain$PC)) }# end of loop on trait }# end ofdetecting null Y #Calculating kinship or PC in case that phenotype is not provided if(is.null(Y)){ myGenotype<-GAPIT.Genotype(G=G,GD=GD,GM=GM,KI=KI,kinMethod=kinMethod,numPCs=numPCs, dataPath=dataPath,numFiles=numFiles,GFile=GFile, GFileExt=GFileExt,GDFile=GDFile, GMFile=GMFile, GDFileExt=GDFileExt,GMFileExt=GMFileExt, mafRate=mafRate,FDR.Rate = FDR.Rate,FDR.Filter.Rate=FDR.Filter.Rate,model="Add") Timmer=myGenotype$Timmer Memory=myGenotype$Memory return (list(KI=myGenotype$KI,PC=myGenotype$PC)) } print("GAPIT accomplished successfully!") } #end of GAPIT function ############################################################################################## GAPIT.Main <- function(Y,G=NULL,GD=NULL,GM=NULL,KI=NULL,Z=NULL,CV=NULL,emmaXp3d=TRUE, groupFrom=1000000 ,groupTo=1,groupBy=10,CA="average", KT='Mean',kinMethod=NULL, ngrid = 100, llin = -10, ulim = 10, esp = 1e-10, dataPath=NULL,numFiles=NULL,GFile=NULL, GFileExt=NULL,GDFile=NULL, GMFile=NULL, GDFileExt=NULL,GMFileExt=NULL, mafRate=0.05,FDR.Rate = 0.05,FDR.Filter.Rate=1,model="Add",numPCs=0, GAPIT.Version=GAPIT.Version, name.of.trait, GT = NULL){ #Object: To perform GWAS and GPS (Genomic Prediction/Selection) #Output: GWAS table (text file), QQ plot (PDF), Manhattan plot (PDF), genomic prediction (text file), and # genetic and residual variance components #Authors: Zhiwu Zhang # Last update: may 12, 2011 ############################################################################################## Timmer=GAPIT.Timmer(Infor="GAPIT (GWAS and GS in R)") Memory=GAPIT.Memory(Infor="GAPIT (GWAS and GS in R)") myGenotype<-GAPIT.Genotype(G=G,GD=GD,GM=GM,KI=KI,kinMethod=kinMethod,numPCs=numPCs, dataPath=dataPath,numFiles=numFiles,GFile=GFile, GFileExt=GFileExt,GDFile=GDFile, GMFile=GMFile, GDFileExt=GDFileExt,GMFileExt=GMFileExt, mafRate=mafRate,FDR.Rate = FDR.Rate,FDR.Filter.Rate=FDR.Filter.Rate,model="Add") Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Kin and PCA") Memory=GAPIT.Memory(Memory=Memory,Infor="Kin and PCA") KI=myGenotype$KI PC=myGenotype$PC hasGenotype=myGenotype$hasGenotype GD=myGenotype$GD GI=myGenotype$GI GT=myGenotype$GT #Examine status of datasets if(is.null(Y)) stop ("Phenotype must exist.") if(is.null(KI)&missing(GD)) stop ("Kinship is required. As genotype is not provided, kinship can not be created.") #When GT and GD are missing, force to have fake ones (creating them from Y) if(is.null(GD) & is.null(GT)) { GT=data.frame(Y[,1]) GD=matrix(1,nrow(Y),1) } #merge CV with PC if(numPCs>0&!is.null(CV))CV=GAPIT.CVMergePC(CV,PC) if(numPCs>0&is.null(CV))CV=PC #Create Z as identity matrix from Y if it is not provided if(is.null(Z)){ taxa=as.character(Y[,1]) Z=as.data.frame(diag(1,nrow(Y))) Z=rbind(taxa,Z) taxa=c('Taxa',as.character(taxa)) Z=cbind(taxa,Z) } #Add the part of non proportion in Z matrix if(!is.null(Z)&nrow(Z)-1groupTo) stop("groupTo should be larger than groupFrom. Please correct them!") if(!is.null(CV)& groupTonrow(qc$KI)) { groupTo=nrow(qc$KI) #maximum of group is number of rows in KI if(groupFrom>nrow(qc$KI)) groupFrom=nrow(qc$KI) warning("The upper bound of groups is too high. It was set to the size of kinship!") } #Optimization for group number, cluster algorithm and kinship type GROUP=seq(groupTo,groupFrom,by=-groupBy)#The reverse order is to make sure to include full model if(missing("CA")) CA=c("ward", "single", "complete", "average", "mcquitty", "median", "centroid") if(missing("KT")) KT=c("Mean", "Max", "Min", "Median") numSetting=length(GROUP)*length(CA)*length(KT) optOnly=FALSE if(numSetting>1 & hasGenotype) optOnly=TRUE #Reform Y, GD and CV into EMMA format ys=as.matrix(qc$Y[2]) X0=as.matrix(qc$CV[,-1]) xs.full=GD[qc$GTindex,] xs.full=as.data.frame(xs.full) xs.reduced=as.data.frame(xs.full[1,]) if( numSetting==1) { xs=as.matrix(xs.full) }else{ xs=as.matrix(xs.reduced) } #Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="PreP3D 4") #Memory=GAPIT.Memory(Memory=Memory,Infor="PreP3D 4") #Initial REMLs.best=10E20 ca.best=CA[1] group.best=GROUP[1] kt.best=KT[1] count=0 compresion.KT<- list(NULL) compresion.CA<- list(NULL) compresion.GROUP<- list(NULL) compresion.REML<- list(NULL) compresion.VG<- list(NULL) compresion.VE<- list(NULL) #add intersection overall <- rep(1, length(ys)) if(min(X0[,1])!=max(X0[,1])) X0 <- cbind(overall, X0) #do not add overall mean if X0 has it already at first column #--------------------------------------------------------------------------------------------------------------------# print("Compressing..." ) print(paste("The total combinations: ", numSetting,sep="")) #Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="DataProcessing") #Memory=GAPIT.Memory(Memory=Memory,Infor="DataProcessing") #Loop to optimize cluster algorithm, group number and kinship type for (ca in CA){ for (group in GROUP){ for (kt in KT){ #Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="PreP3D 1") #Memory=GAPIT.Memory(Memory=Memory,Infor="PreP3D 1") count=count+1 if(group1) { #Reform KW and Z into EMMA format K=as.matrix(bk.best$KW) z0=as.matrix(zc.best$Z[,-1]) Z=matrix(as.numeric(z0),nrow=nrow(z0),ncol=ncol(z0)) xs=as.matrix(xs.full) p3d <- GAPIT.EMMAxP3D(ys=ys,xs=xs,K = K ,Z=Z,X0=X0,GI=GI,emmaXp3d=emmaXp3d,Timmer=Timmer,Memory=Memory, dataPath=dataPath,numFiles=numFiles,GFile=GFile,GFileExt=GFileExt,GDFile=GDFile, GMFile=GMFile, GDFileExt=GDFileExt,GMFileExt=GMFileExt, GTindex=qc$GTindex,genoFormat=genoFormat,optOnly=FALSE,model=model) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="GWAS") Memory=GAPIT.Memory(Memory=Memory,Infor="GWAS") } #Merge p value for multiple genotype files if(numFiles>1) { theP <- read.table(paste("GAPIT.TMP.ps.","1",".txt",sep=""), head = FALSE) theMAF <- read.table(paste("GAPIT.TMP.maf.","1",".txt",sep=""), head = FALSE) thenobs <- read.table(paste("GAPIT.TMP.nobs.","1",".txt",sep=""), head = FALSE) colnames(theP)="P" colnames(theMAF )="MAF" colnames(thenobs )="nobs" ps=theP MAF=theMAF nobs=thenobs for (file in 2:numFiles) { theGI <- read.table(paste("GAPIT.TMP.GI.",file,".txt",sep=""), head = TRUE) theP <- read.table(paste("GAPIT.TMP.ps.",file,".txt",sep=""), head = FALSE) theMAF <- read.table(paste("GAPIT.TMP.maf.",file,".txt",sep=""), head = FALSE) thenobs <- read.table(paste("GAPIT.TMP.nobs.",file,".txt",sep=""), head = FALSE) colnames(theP)="P" colnames(theMAF )="MAF" colnames(thenobs )="nobs" ps=rbind(ps,theP) MAF=rbind(MAF,theMAF) nobs=rbind(nobs,thenobs) GI=rbind(GI,data.frame(as.matrix(theGI))) } p3d$ps=ps p3d$maf=MAF p3d$nobs=nobs } #genomic prediction if(length(bk.best$KW)>ncol(X0)) gs <- GAPIT.GS(KW=bk.best$KW,KO=bk.best$KO,KWO=bk.best$KWO,GAU=bk.best$GAU,UW=cbind(p3d.best$BLUP,p3d.best$PEV)) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="GPS") Memory=GAPIT.Memory(Memory=Memory,Infor="GPS") #--------------------------------------------------------------------------------------------------------------------# #Final report print("Generating summary" ) #output BLUP and PEV print("Genomic Breeding Values (GBV) ..." ) file=paste("GAPIT.", name.of.trait,".BLUP.csv" ,sep = "") if(length(bk.best$KW)>ncol(X0)) write.table(gs$BLUP, file, quote = FALSE, sep = ",", row.names = FALSE,col.names = TRUE) #Make heatmap for distribution of BLUP and PEV print("GBV and accuracy distribution..." ) if(length(bk.best$KW)>ncol(X0)) GAPIT.GS.Visualization(gsBLUP = gs$BLUP, BINS=20,name.of.trait = name.of.trait) #Make a plot Summarzing the Compression Results, if more than one "compression level" has been assessed print("Compression portfolios..." ) GAPIT.Compression.Visualization(Compression = Compression, name.of.trait = name.of.trait) #Export GWAS results if(hasGenotype ) { print("Filtering SNPs with MAF..." ) PWI.Filtered <- GAPIT.Filter.SNPs.with.Low.MAFs(GI = GI, P.values = as.matrix(p3d$ps), maf = as.matrix(p3d$maf), nobs = as.matrix(p3d$nobs), mafRate = mafRate) #Run the BH multiple correction procedure of the results #Create PWIP, which is a table of SNP Names, Chromosome, bp Position, Raw P-values, FDR Adjusted P-values print("Calculating FDR..." ) PWIP <- GAPIT.Perform.BH.FDR.Multiple.Correction.Procedure(PWI = PWI.Filtered, FDR.Rate = FDR.Rate, FDR.Procedure = "BH") if(!is.null(PWI.Filtered)) { #QQ plots print("QQ plot..." ) GAPIT.QQ(P.values = PWIP$PWIP[,4], name.of.trait = name.of.trait,numSlots=10000) #Manhattan Plots print("Manhattan plot..." ) GAPIT.Manhattan(GI.MP = PWIP$PWIP[,2:4], name.of.trait = name.of.trait, numSlots=10000, plot.type = "Chromosomewise") GAPIT.Manhattan(GI.MP = PWIP$PWIP[,2:4], name.of.trait = name.of.trait, numSlots=50000, plot.type = "Genomewise") #Summary Table print("Association table..." ) GAPIT.Table(final.table = PWIP$PWIP, name.of.trait = name.of.trait,FDR.Filter.Rate=FDR.Filter.Rate) } } #Log log=GAPIT.Log(Y=Y,KI=KI,Z=Z,CV=CV,emmaXp3d=emmaXp3d, groupFrom = groupFrom ,groupTo =groupTo ,groupBy = groupBy ,CA = CA, KT= KT, ngrid = ngrid , llin = llin , ulim = ulim , esp = esp ,name.of.trait = name.of.trait) #Memory usage #GAPIT.Memory.Object(name.of.trait=name.of.trait) #Timming Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Report") Memory=GAPIT.Memory(Memory=Memory,Infor="Report") file=paste("GAPIT.", name.of.trait,".Timming.csv" ,sep = "") write.table(Timmer, file, quote = FALSE, sep = ",", row.names = FALSE,col.names = TRUE) file=paste("GAPIT.", name.of.trait,".Memory.Stage.csv" ,sep = "") write.table(Memory, file, quote = FALSE, sep = ",", row.names = FALSE,col.names = TRUE) print(paste(name.of.trait, "has been analyzed successfully!") ) print(paste("The reulsts are saved in the directory of ", getwd()) ) print("---------------------------------------------------------------------------------------") return (list(Timmer=Timmer,Compression=Compression,KI=KI,PC=PC)) }#The function GAPIT.Main ends here ############################################################################################## GAPIT.EMMAxP3D <- function(ys,xs,K=NULL,Z=NULL,X0=NULL,GI=NULL,emmaXp3d=TRUE,Timmer,Memory,ngrids=100,llim=-10,ulim=10,esp=1e-10, dataPath=NULL,numFiles=1,GFile=NULL,GFileExt=NULL,GTindex=NULL,GDFile=NULL, GMFile=NULL, GDFileExt=NULL,GMFileExt=NULL,genoFormat="Hapmap", optOnly=TRUE,model="Add"){ #Object: To esimate variance component by using EMMA algorithm and perform GWAS with P3D/EMMAx #Output: ps, REMLs, stats, dfs, vgs, ves, BLUP, BLUP_Plus_Mean, PEV #Authors: Feng Tian, Alex Lipka and Zhiwu Zhang # Last update: April 26, 2011 # Library used: EMMA (Kang et al, Genetics, Vol. 178, 1709-1723, March 2008) # Note: This function was modified from the function of emma.REML.t from the library ############################################################################################## Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="P3D Start") Memory=GAPIT.Memory(Memory=Memory,Infor="P3D Start") #--------------------------------------------------------------------------------------------------------------------< #Change data to matrix format if they are not if (is.null(dim(ys)) || ncol(ys) == 1) ys <- matrix(ys, 1, length(ys)) if (is.null(X0)) X0 <- matrix(1, ncol(ys), 1) #handler of special Z and K if(!is.null(Z)){ if(ncol(Z) == nrow(Z)) Z = NULL } if(!is.null(K)) {if(length(K)<2) K = NULL} #Extract dimension information g <- nrow(ys) #number of traits n <- ncol(ys) #number of observation q0 <- ncol(X0)#number of fixed effects q1 <- q0 + 1 #Nuber of fixed effect including SNP nr=n if(!is.null(K)) tv=ncol(K) #decomposation without fixed effect if(!is.null(K)) eig.L <- emma.eigen.L(Z, K) #this function handle both NULL Z and non-NULL Z matrix #if(!is.null(K)) eig.L$values[which(eig.L$values<0)]=0 #Negative eigen values Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="eig.L") Memory=GAPIT.Memory(Memory=Memory,Infor="eig.L") #decomposation with fixed effect (SNP not included) X <- X0 #covariate variables such as population structure if (!is.null(Z) & !is.null(K)) eig.R <- emma.eigen.R.w.Z(Z, K, X) #This will be used to get REstricted ML (REML) if (is.null(Z) & !is.null(K)) eig.R <- emma.eigen.R.wo.Z( K, X) #This will be used to get REstricted ML (REML) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="eig.R") Memory=GAPIT.Memory(Memory=Memory,Infor="eig.R") #--------------------------------------------------------------------------------------------------------------------> #Loop on Traits for (j in 1:g) { #--------------------------------------------------------------------------------------------------------------------< if(!is.null(K)) REMLE <- emma.REMLE(ys[j,], X, K, Z, ngrids, llim, ulim, esp, eig.R) if (!is.null(Z) & !is.null(K)) U <- eig.L$vectors * matrix(c(sqrt(1/(eig.L$values + REMLE$delta)),rep(sqrt(1/REMLE$delta),nr - tv)),nr,((nr-tv)+length(eig.L$values)),byrow=TRUE) if ( is.null(Z) & !is.null(K)) U <- eig.L$vectors * matrix( sqrt(1/(eig.L$values + REMLE$delta)),nr,length(eig.L$values),byrow=TRUE) x.prev <- vector(length = 0) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="REML") Memory=GAPIT.Memory(Memory=Memory,Infor="REML") #--------------------------------------------------------------------------------------------------------------------> if(optOnly) numFiles=1 #Skip sreening multiple genotype files #Add loop for genotype data files for (file in 1:numFiles) { Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="New Genotype file") Memory=GAPIT.Memory(Memory=Memory,Infor="New Genotype file") #update xs for each file if(file>1) { rm(xs) gc() #Hapmap format if(genoFormat=="Hapmap") { print("debug: reading G") G <- read.table(paste(dataPath,GFile,file, ".",GFileExt,sep=""), head = FALSE) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Genotype loaded") Memory=GAPIT.Memory(Memory=Memory,Infor="Genotype loaded") #Convert HapMap to numerical print(paste("Converting genotype...",sep="")) hm=GAPIT.HapMap(G,model=model) print(paste("Converting genotype done.",sep="")) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="HapMap") Memory=GAPIT.Memory(Memory=Memory,Infor="HapMap") G=NULL gc() Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="G removed") Memory=GAPIT.Memory(Memory=Memory,Infor="G removed") GT=hm$GT print(paste("GT Generated.",sep="")) Memory=GAPIT.Memory(Memory=Memory,Infor="GT Generated") GI=hm$GI print(paste("GI Generated.",sep="")) Memory=GAPIT.Memory(Memory=Memory,Infor="GI Generated") xs=hm$GD[GTindex,] print(paste("xs Generated.",sep="")) Memory=GAPIT.Memory(Memory=Memory,Infor="xs Generated") rm(hm) gc() Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="hm removed") Memory=GAPIT.Memory(Memory=Memory,Infor="hm removed") } #nd of exising G #EMMA format if(genoFormat=="EMMA") { print("debug: reading GD and GM") GI <- read.table(paste(dataPath,GMFile,file, ".",GMFileExt,sep=""), head = TRUE) GD <- read.table(paste(dataPath,GDFile,file, ".",GDFileExt,sep=""), head = TRUE) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Genotype data loaded)") Memory=GAPIT.Memory(Memory=Memory,Infor="Genotype data loaded") GT=as.matrix(GD[,1]) #get taxa GD=GD[,-1] #remove taxa column xs=GD[GTindex,] Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="GT created from GD)") Memory=GAPIT.Memory(Memory=Memory,Infor="GT created from GD") } } # end of multiple genotype files if (is.null(dim(xs)) || nrow(xs) == 1) xs <- matrix(xs, length(xs),1) m <- ncol(xs) #number of SNPs t <- nrow(xs) #number of individuals #allocate spaces for SNPs dfs <- matrix(nrow = m, ncol = g) stats <- matrix(nrow = m, ncol = g) ps <- matrix(nrow = m, ncol = g) nobs <- matrix(nrow = m, ncol = g) maf <- matrix(nrow = m, ncol = g) #print(paste("Memory allocated.",sep="")) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Memory allocation") Memory=GAPIT.Memory(Memory=Memory,Infor="Memory allocation") if(optOnly)mloop=0 if(!optOnly)mloop=m #Loop on SNPs print(paste("Number of SNPs is ",mloop," in genotype file ",file, sep="")) for (i in 0:mloop){ #print(i) #--------------------------------------------------------------------------------------------------------------------< normalCase=TRUE if ((i>0)&(floor(i/1000)==i/1000)) print(paste("Genotype file: ", file,", SNP: ",i," ",sep="")) # To extract current snp. It save computation for next one in case they are identical if(i==0&file==1){ #For the model without fitting SNP vids <- !is.na(ys[j,]) #### Feng changed xv <- ys[j, vids]*0+1 #### Feng changed } if(i>0){ vids <- !is.na(xs[,i]) #### Feng changed xv <- xs[vids,i] #### Feng changed vids.TRUE=which(vids==TRUE) vids.FALSE=which(vids==FALSE) ns=length(xv) ss=sum(xv) maf[i]=min(.5*ss/ns,1-.5*ss/ns) nobs[i]=ns } #Situation of no variation for SNP except the fisrt one(synthetic for EMMAx/P3D) if ((min(xv) ==max(xv) )&i>0) { dfs[i, ] <- rep(NA, g) stats[i, ] <- rep(NA, g) ps[i, ] = rep(1, g) normalCase=FALSE } #Situation of the SNP is identical to previous if (identical(x.prev, xv)) { dfs[i, ] <- dfs[i - 1, ] stats[i, ] <- stats[i - 1, ] ps[i, ] <- ps[i - 1, ] normalCase=FALSE } #--------------------------------------------------------------------------------------------------------------------> #Normal case if(normalCase) { #--------------------------------------------------------------------------------------------------------------------< #nv <- sum(vids) yv <- ys[j, vids] #### Feng changed nr <- sum(vids) #### Feng changed if (!is.null(Z) & !is.null(K)) { r<- ncol(Z) ####Feng, add a variable to indicate the number of random effect vran <- vids[1:r] ###Feng, add a variable to indicate random effects with nonmissing genotype tv <- sum(vran) #### Feng changed } #--------------------------------------------------------------------------------------------------------------------> #--------------------------------------------------------------------------------------------------------------------< if(i>0) dfs[i, j] <- nr - q1 if(i>0) X <- cbind(X0[vids, , drop = FALSE], xs[vids,i]) #Recalculate eig and REML if not using P3D NOTE THIS USED TO BE BEFORE the two solid lines if(emmaXp3d==FALSE & !is.null(K)) { if (!is.null(Z)) eig.R <- emma.eigen.R.w.Z(Z, K, X) #This will be used to get REstricted ML (REML) if (is.null(Z)) eig.R <- emma.eigen.R.wo.Z( K, X) #This will be used to get REstricted ML (REML) if (!is.null(Z)) REMLE <- emma.REMLE(ys[j,], X, K, Z, ngrids, llim, ulim, esp, eig.R) if ( is.null(Z)) REMLE <- emma.REMLE(ys[j,], X, K, Z = NULL, ngrids, llim, ulim, esp, eig.R) if (!is.null(Z) & !is.null(K)) U <- eig.L$vectors * matrix(c(sqrt(1/(eig.L$values + REMLE$delta)),rep(sqrt(1/REMLE$delta),nr - tv)),nr,((nr-tv)+length(eig.L$values)),byrow=TRUE) if ( is.null(Z) & !is.null(K)) U <- eig.L$vectors * matrix( sqrt(1/(eig.L$values + REMLE$delta)),nr,length(eig.L$values),byrow=TRUE) } if(n==nr) { if(!is.null(K)) { yt <- crossprod(U, yv) Xt <- crossprod(U, X) }else{ yt=yv Xt=X } XX=crossprod(Xt, Xt) if(XX[1,1] == "NaN") { Xt[which(Xt=="NaN")]=0 yt[which(yt=="NaN")]=0 XX=crossprod(Xt, Xt) } XY=crossprod(Xt, yt) } #Missing SNP if(n>nr) { UU=crossprod(U,U) A11=UU[vids.TRUE,vids.TRUE] A12=UU[vids.TRUE,vids.FALSE] A21=UU[vids.FALSE,vids.TRUE] A22=UU[vids.FALSE,vids.FALSE] A22i=try(solve(A22) ) if(inherits(A22i, "try-error")) A22i <- ginv(A22) F11=A11-A12%*%A22i%*%A21 XX=crossprod(X,F11)%*%X XY=crossprod(X,F11)%*%yv } iXX <- try(solve(XX) ) if(inherits(iXX, "try-error")) iXX <- ginv(crossprod(Xt, Xt)) beta <- iXX %*% XY #--------------------------------------------------------------------------------------------------------------------> #--------------------------------------------------------------------------------------------------------------------< if(i==0 &file==1 & !is.null(K)) { Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="ReducedModel") Memory=GAPIT.Memory(Memory=Memory,Infor="ReducdModel") #Calculate the blup gammahat = vgKZprimeVinv*(Y-Xbetahat) vgs <- REMLE$vg ves <- REMLE$ve REMLs <- REMLE$REML XtimesBetaHat <- X %*% beta YminusXtimesBetaHat <- ys[j,]- XtimesBetaHat vgK <- REMLE$vg*K Dt <- crossprod(U, YminusXtimesBetaHat) if (!is.null(Z)) Zt <- crossprod(U, Z) if (is.null(Z)) Zt <- t(U) if(XX[1,1] == "NaN") { Dt[which(Dt=="NaN")]=0 Zt[which(Zt=="NaN")]=0 } BLUP <- K %*% crossprod(Zt, Dt) #Using K instead of vgK because using H=V/Vg grand.mean.vector <- rep(beta[1], length(BLUP)) BLUP_Plus_Mean <- grand.mean.vector + BLUP Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="BLUP") Memory=GAPIT.Memory(Memory=Memory,Infor="BLUP") #PEV C11=try(vgs*solve(crossprod(Xt,Xt))) if(inherits(C11, "try-error")) C11=vgs*ginv(crossprod(Xt,Xt)) C21=-K%*%crossprod(Zt,Xt)%*%C11 Kinv=try(solve(K) ) if(inherits(Kinv, "try-error")) Kinv=ginv(K) if(!is.null(Z)) term.0=crossprod(Z,Z)/ves if(is.null(Z)) term.0=diag(1/ves,nrow(K)) term.1=try(solve(term.0+Kinv/vgs ) ) if(inherits(term.1, "try-error")) term.1=ginv(term.0+Kinv/vgs ) term.2=C21%*%crossprod(Xt,Zt)%*%K C22=(term.1-term.2 ) PEV=as.matrix(diag(C22)) BLUE=X%*%beta Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="PEV") Memory=GAPIT.Memory(Memory=Memory,Infor="PEV") }#end of if(i==0&file==1 & !is.null(K)) #--------------------------------------------------------------------------------------------------------------------> #--------------------------------------------------------------------------------------------------------------------< if(i==0 &file==1 & is.null(K)) { YY=crossprod(yt, yt) ves=(YY-crossprod(beta,XY))/(n-q0) r=yt-X%*%iXX%*%XY REMLs=-.5*(n-q0)*log(det(ves)) -.5*n -.5*(n-q0)*log(2*pi) # REMLs=-.5*n*log(det(ves)) -.5*log(det(iXX)/ves) -.5*crossprod(r,r)/ves -.5*(n-q0)*log(2*pi) vgs = 0 BLUP = 0 BLUP_Plus_Mean = NaN PEV = ves BLUE=X%*%beta } #calculate t statistics and probabilty if(i > 0) { if(!is.null(K)) stats[i, j] <- beta[q1]/sqrt(iXX[q1, q1] *REMLE$vg) if(is.null(K)) stats[i, j] <- beta[q1]/sqrt(iXX[q1, q1] *ves) ps[i, ] <- 2 * pt(abs(stats[i, ]), dfs[i, ],lower.tail = FALSE) } #--------------------------------------------------------------------------------------------------------------------> } # End of if(normalCase) x.prev=xv #update SNP } # End of loop on SNPs Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Screening SNPs") Memory=GAPIT.Memory(Memory=Memory,Infor="Screening SNPs") #output p value for the genotype file if(numFiles>1) { write.table(GI, paste("GAPIT.TMP.GI.",file,".txt",sep=""), quote = FALSE, sep = "\t", row.names = FALSE,col.names = TRUE) write.table(ps, paste("GAPIT.TMP.ps.",file,".txt",sep=""), quote = FALSE, sep = "\t", row.names = FALSE,col.names = FALSE) write.table(maf, paste("GAPIT.TMP.maf.",file,".txt",sep=""), quote = FALSE, sep = "\t", row.names = FALSE,col.names = FALSE) write.table(nobs, paste("GAPIT.TMP.nobs.",file,".txt",sep=""), quote = FALSE, sep = "\t", row.names = FALSE,col.names = FALSE) #rm(dfs,stats,ps,nobs,maf,GI) #This cause problem on return #gc() } } # Ebd of loop on file } # End of loop on traits Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="GWAS done for this Trait") Memory=GAPIT.Memory(Memory=Memory,Infor="GWAS done for this Trait") return(list(ps = ps, REMLs = -2*REMLs, stats = stats, dfs = dfs,maf=maf,nobs = nobs,Timmer=Timmer,Memory=Memory, vgs = vgs, ves = ves, BLUP = BLUP, BLUP_Plus_Mean = BLUP_Plus_Mean, PEV = PEV, BLUE=BLUE)) #print("GAPIT.EMMAxP3D accomplished successfully!") }#end of GAPIT.EMMAxP3D function ############################################################################################## GAPIT.ZmatrixFormation <- function(Z,Y){ #Object: To expande the proportion Z to final Z #Output: Z #Authors: Zhiwu Zhang # Last update: April 22, 2011 ############################################################################################## #split individuals in Y to the ones that are given Z and the one not taxa.Z=as.matrix(Z[-1,1]) taxa.Y=as.matrix(Y[,1]) taxa.diff=setdiff(taxa.Y,taxa.Z) taxa.I=as.matrix(taxa.Y[match(taxa.diff,taxa.Y,nomatch = 0)]) taxa.Z.col=as.matrix(Z[1,-1]) #Create final Z with zero block and identity block Z0=matrix(data=0,nrow=nrow(taxa.Z),ncol=nrow(taxa.I)) Z1=diag(1,nrow(taxa.I)) ZC=as.matrix(rbind(Z0,Z1)) #To label rows and columns label.row=rbind(as.matrix(Z[,1]),taxa.I) label.col=t(taxa.I) #update the zero block by the given Z matrix position=t(as.matrix(match(taxa.Z.col,taxa.I,nomatch = 0))) ZC[1:nrow(taxa.Z),position]=as.matrix(Z[-1,-1]) #habdler of parents do not have phenotype (colums of Z are not in taxa.I) # To do list #To form final Z matrix dataPart=rbind(label.col,ZC) Z=data.frame(cbind(label.row,dataPart)) #print("GAPIT.ZmatrixFormation accomplished successfully!") return(Z) }#The function GAPIT.ZmatrixFormation ends here ############################################################################################## GAPIT.QC <- function(Y,KI,GT,CV,Z){ #Object: to do data quality control #Output: Y, KI, GD, CV, Z, flag #Authors: Zhiwu Zhang and Alex Lipka # Last update: April 14, 2011 ############################################################################################## #Remove missing phenotype Y=Y[which(Y[,2]!="NaN"),] # Remove duplicates: # GT row wise, Z column wise, and KI both direction. if(exists("GT")) { taxa.kept=unique(GT[,1]) } taxa.all=KI[,1] taxa.uniqe=unique(taxa.all) position=match(taxa.uniqe, taxa.all,nomatch = 0) position.addition=cbind(1,t(1+position)) KI=KI[position,position.addition] if(exists("Z")) { taxa.all=as.matrix(Z[1,]) taxa.uniqe=intersect(taxa.all,taxa.all) position=match(taxa.uniqe, taxa.all,nomatch = 0) Z=Z[,position] } #Remove the columns of Z if they are not in KI/GT. KI/GT are allowed to have individuals not in Z taxa.all=KI[,1] taxa.kinship=unique(taxa.all) taxa.Z=as.matrix(Z[1,]) #taxa.Z=colnames(Z) #This does not work for names starting with numerical or "-" taxa.Z_K_common=intersect(taxa.kinship,taxa.Z) Z <-cbind(Z[,1], Z[,match(taxa.Z_K_common, taxa.Z, nomatch = 0)]) #Remove the rows of Z if all the ellements sum to 0 Z1=Z[-1,-1] Z2=data.frame(Z1) Z3=as.matrix(Z2) Z4=as.numeric(Z3) #one dimemtion Z5=matrix(data = Z4, nrow = nrow(Z1), ncol = ncol(Z1)) RS=rowSums(Z5)>0 #The above process could be simplified! Z <- Z[c(TRUE,RS),] #make individuals the same in Z, Y, GT and CV # get intersect of all the data taxa=intersect(Z[-1,1],Y[,1]) if(exists("GT"))taxa=intersect(taxa,taxa.kept) if(exists("CV"))taxa=intersect(taxa,CV[,1]) #keep the common ones t=c(TRUE, Z[-1,1]%in%taxa) Z <- Z[t,] #Remove the columns of Z if all the ellements sum to 0 Z1=Z[-1,-1] Z2=data.frame(Z1) Z3=as.matrix(Z2) Z4=as.numeric(Z3) #one dimemtion Z5=matrix(data = Z4, nrow = nrow(Z1), ncol = ncol(Z1)) CS=colSums(Z5)>0 #The above process could be simplified! Z <- Z[,c(TRUE,CS)] Y <- Y[Y[,1]%in%taxa,] if(exists("GT")) taxa.kept=data.frame(taxa.kept[taxa.kept%in%taxa]) if(exists("CV")) CV=CV[CV[,1]%in%taxa,] #get position of taxa.kept in GT position=match(taxa.kept[,1], GT[,1],nomatch = 0) #To sort Y, GT, CV and Z Y=Y[order(Y[,1]),] CV=CV[order(CV[,1]),] order.taxa.kept=order(taxa.kept[,1]) Z=Z[c(1,1+order(Z[-1,1])),] GTindex=position[order.taxa.kept] flag=nrow(Y)==nrow(Z)-1&nrow(Y)==nrow(GT)&nrow(Y)==nrow(CV) #print("GAPIT.QC accomplished successfully!") return(list(Y = Y, KI = KI, GT = GT, CV = CV, Z = Z, GTindex=GTindex, flag=flag)) }#The function GAPIT.QC ends here ############################################################################################## GAPIT.Compress <- function(KI,CA = "average",KT = "Mean",GN=nrow(KI),Timmer,Memory){ #Object: To cluster individuals into groups based on kinship #Output: GA, KG #Authors: Alex Lipka and Zhiwu Zhang # Last update: April 14, 2011 ############################################################################################## #For debug #GN=7 Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="CP start") Memory=GAPIT.Memory(Memory=Memory,Infor="cp start") # Extract the line names line.names <- KI[,1] # Remove the first column of the kinship matrix, which is the line names KI <- KI[ ,-1] # Convert kinship to distance distance.matrix <- 2 - KI distance.matrix.as.dist <- as.dist(distance.matrix) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="CP distance") Memory=GAPIT.Memory(Memory=Memory,Infor="cp distance") # hclust() will perform the hiearchical cluster analysis cluster.distance.matrix <- hclust(distance.matrix.as.dist, method = CA) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="CP cluster") Memory=GAPIT.Memory(Memory=Memory,Infor="cp cluster") # Cutree will assign lines into k clusters group.membership <- cutree(cluster.distance.matrix, k = GN) #Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="CP cutree") #Memory=GAPIT.Memory(Memory=Memory,Infor="cp cutree") #calculate group kinship if(KT == "Mean"){ #This matrix operation is much faster than tapply function for "Mean" x=as.factor(group.membership) #b = model.matrix(~x-1) n=max(as.numeric(as.vector(x))) b=diag(n)[x,] KG=t(b)%*%as.matrix(KI)%*%b CT=t(b)%*%(0*as.matrix(KI)+1)%*%b KG=as.matrix(KG/CT) rownames(KG)=c(1:nrow(KG)) colnames(KG)=c(1:ncol(KG)) #Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="CP calculation original") #Memory=GAPIT.Memory(Memory=Memory,Infor="cp calculation original") } gm=as.factor(group.membership) kv=as.numeric(as.matrix(KI)) kvr=rep(gm,ncol(KI)) kvc=as.numeric(t(matrix(kvr,nrow(KI),ncol(KI)))) kInCol=t(rbind(kv,kvr,kvc)) if(KT != "Mean"){ #if (KT == "Mean") # KG<- tapply(kInCol[,1], list(kInCol[,2], kInCol[,3]), mean) if (KT == "Max") KG <- tapply(kInCol[,1], list(kInCol[,2], kInCol[,3]), max) if (KT == "Min") KG <- tapply(kInCol[,1], list(kInCol[,2], kInCol[,3]), min) if (KT == "Median") KG <- tapply(kInCol[,1], list(kInCol[,2], kInCol[,3]), median) } #this is end of brancing "Mean" and the rest Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="CP calculation") Memory=GAPIT.Memory(Memory=Memory,Infor="cp calculation") # add line names GA <- data.frame(group.membership) GA <- data.frame(cbind(as.character(line.names),as.numeric(group.membership) )) #Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="CP Final") #Memory=GAPIT.Memory(Memory=Memory,Infor="CP Final") #print("GAPIT.Compress accomplished successfully!") return(list(GA=GA, KG=KG,Timmer=Timmer,Memory=Memory)) }#The function GAPIT.Compress ends here ############################################################################################## GAPIT.Block <- function(Z,GA,KG){ #Object: To split a group kinship into two blocks containing individuals with and without phenotype #Output: GAU,KW,KO,KWO #Authors: Zhiwu Zhang and Alex Lipka # Last update: April 14, 2011 ############################################################################################## # To separate group kiship into two blocks: with and without phenotype. # A group goes to with phenotype as loog as it has one phenotyped individual. #find position in group assignment (GA) for the individual associate with phenotype (specified by Z) #taxa=unique(intersect(as.matrix(Z[1,-1]),GA[,1])) taxa.Z=as.matrix(Z[1,-1]) taxa.GA=as.matrix(GA[,1]) position=taxa.GA%in%taxa.Z #Initial block as 2 GAU=cbind(GA,2) #Assign block as 1 if the individual has phenotype GAU[position,3]=1 #Modify the non-phenotyped individuals if they in a group with phenotyped individuals #To find the groups with phenotyped individuals #update block assignment for all these groups #get list of group that should be block 1 grp.12=as.matrix(unique(GAU[,2])) grp.1=as.matrix(unique(GAU[which(GAU[,3]==1),2])) grp.2= as.matrix(setdiff(grp.12,grp.1)) numWithout=length(grp.2) order.1=1:length(grp.1) order.2=1:length(grp.2) if(numWithout >0) grpblock=as.matrix(rbind(cbind(grp.1,1,order.1), cbind(grp.2,2,order.2))) if(numWithout==0) grpblock=as.matrix( cbind(grp.1,1,order.1), ) order.block=order(as.matrix(GAU[,3])) colnames(grpblock)=c("grp","block","ID") GAU0 <- merge(GAU[order.block,-3], grpblock, by.x = "X2", by.y = "grp") GAU=GAU0[,c(2,1,3,4)] KW=KG[grp.1,grp.1] KO=KG[grp.2,grp.2] KWO=KG[grp.1,grp.2] #write.table(GAU, "GAU.txt", quote = FALSE, sep = "\t", row.names = TRUE,col.names = TRUE) #print("GAPIT.Block accomplished successfully!") return(list(GAU=GAU,KW=KW,KO=KO,KWO=KWO)) }#The function GAPIT.Block ends here ############################################################################################## GAPIT.ZmatrixCompress <- function(Z,GAU){ #Object: To assign the fraction of a individual belonging to a group #Output: Z #Authors: Zhiwu Zhang # Last update: April 14, 2011 ############################################################################################## #sort Z column wise order.Z=order(as.matrix(Z[1,-1])) Z1=Z[-1,-1] Z1 <- Z1[,order.Z] #Extraction of GAU coresponding to Z, sort GAU rowwise to mach columns of Z, and make design matrix effect.Z=as.matrix(Z[1,-1]) effect.GAU=as.matrix(GAU[,1]) GAU0=GAU[effect.GAU%in%effect.Z,] order.GAU=order(GAU0[,1]) GAU1 <- GAU0[order.GAU,] id.1=GAU1[which(GAU1[,3]==1),4] n=max(as.numeric(as.vector(id.1))) x=as.numeric(as.matrix(GAU1[,4])) DS=diag(n)[x,] #write.table(b, "debug.txt", quote = FALSE, sep = "\t", row.names = TRUE,col.names = TRUE) #write.table(GAU1, "debug2.txt", quote = FALSE, sep = "\t", row.names = TRUE,col.names = TRUE) #Z matrix from individual to group Z1.numeric <- as.numeric(as.matrix(Z1)) Z1.matrix <- matrix(Z1.numeric, nrow = nrow(Z1), ncol = ncol(Z1)) Z2=Z1.matrix%*%DS Z3=data.frame(cbind(as.character(Z[-1,1]),Z2)) Z=Z3[order(Z3[,1]),] #write.table(DS, "debug.txt", quote = FALSE, sep = "\t", row.names = TRUE,col.names = TRUE) #print("GAPIT.ZmatrixCompress accomplished successfully!") return(list(Z=Z)) }#The function GAPIT.ZmatrixCompress ends here ############################################################################################## GAPIT.GS <- function(KW,KO,KWO,GAU,UW){ #Object: to derive BLUP for the individuals without phenotype #Output: BLUP #Authors: Zhiwu Zhang # Last update: April 17, 2011 ############################################################################################## UO=try(t(KWO)%*%solve(KW)%*%UW) if(inherits(UO, "try-error")) UO=t(KWO)%*%ginv(KW)%*%UW n=ncol(UW) #get number of columns, add additional for individual name #Assign BLUP of group to its individuals BLUP=data.frame(as.matrix(GAU[,1:4])) BLUP.W=BLUP[which(GAU[,3]==1),] order.W=order(as.numeric(as.matrix(BLUP.W[,4]))) ID.W=as.numeric(as.matrix(BLUP.W[order.W,4])) n.W=max(ID.W) DS.W=diag(n.W)[ID.W,] ind.W=DS.W%*%UW all.W=cbind(BLUP.W[order.W,],ind.W) all=all.W BLUP.O=BLUP[which(GAU[,3]==2),] if(nrow(BLUP.O)>0){ order.O=order(as.numeric(as.matrix(BLUP.O[,4]))) ID.O=as.numeric(as.matrix(BLUP.O[order.O,4])) n.O=max(ID.O) DS.O=diag(n.O)[ID.O,] ind.O=DS.O%*%UO all.O=cbind(BLUP.O[order.O,],ind.O) all=rbind(all.W,all.O) } colnames(all)=c("Taxa", "Group", "RefInf","ID","BLUP","PEV") #write.table(index.W, "debug.txt", quote = FALSE, sep = "\t", row.names = TRUE,col.names = TRUE) #print("GAPIT.GS accomplished successfully!") return(list(BLUP=all)) }#The function GAPIT.GS ends here ############################################################################################## GAPIT.Numericalization1.Add <- function(x){ #Object: To convert character SNP genotpe to numerical #Output: Coresponding numerical value #Authors: Feng Tian and Zhiwu Zhang # Last update: May 30, 2011 ############################################################################################## x[x=="X"]="N" x[x=="K"]="Z" #K (for GT genotype)is is replaced by Z to ensure heterozygose has the largest value n=length(x) lev=levels(as.factor(x)) lev=setdiff(lev,"N") len=length(lev) #print(lev) #1 or less status if(len<=1)x=0 #2 status if(len==2)x=ifelse(x=="N",1,ifelse(x==lev[1],0,2)) #3 status if(len==3)x=ifelse(x=="N",1,ifelse(x==lev[1],0,ifelse(x==lev[3],1,2))) # more than 3 status if(len> 3){ x=0 warning("Existing none biallelic SNP!") } return(matrix(x,n,1)) }#end of GAPIT.Numericalization1.Add function ############################################################################################## GAPIT.Numericalization1.Dom <- function(x){ #Object: To convert character SNP genotpe to numerical #Output: Coresponding numerical value #Authors: Feng Tian and Zhiwu Zhang # Last update: May 30, 2011 ############################################################################################## x[x=="X"]="N" n=length(x) lev=levels(as.factor(x)) lev=setdiff(lev,"N") len=length(lev) #print(lev) # 2 or 3 status x=ifelse((len<2|len>3),0,ifelse(x=="N",.5,ifelse((x=="R"|x=="Y"|x=="S"|x=="W"|x=="K"|x=="M"),0,1))) return(matrix(x,length(x),1)) }#end of GAPIT.Numericalization1.Dom function ############################################################################################## GAPIT.Numericalization2.Add <- function(x){ #Object: To convert character SNP genotpe to numerical #Output: Coresponding numerical value #Authors: Feng Tian and Zhiwu Zhang # Last update: May 30, 2011 ############################################################################################## x[x=="N"]="NN" x[x=="X"]="NN" n=length(x) lev=levels(as.factor(x)) lev=setdiff(lev,"NN") len=length(lev) #print(lev) #x1=ifelse(x=="NN",NA,ifelse(x==lev[1],0,ifelse(x==lev[2],1,2))) #1 or less status if(len<=1)x=0 #2 status if(len==2)x=ifelse(x=="NN",1,ifelse(x==lev[1],0,2)) #3 status if(len==3)x=ifelse(x=="NN",1,ifelse(x==lev[1],0,ifelse(x==lev[2],1,2))) # more than 3 status if(len> 3){ x=0 warning("Existing none biallelic SNP!") } return(matrix(x,n,1)) }#end of GAPIT.Numericalization2.Add function ############################################################################################## GAPIT.Numericalization2.Dom <- function(x){ #Object: To convert character SNP genotpe to numerical #Output: Coresponding numerical value #Authors: Feng Tian and Zhiwu Zhang # Last update: May 30, 2011 ############################################################################################## lev=levels(as.factor(x)) lev=setdiff(lev,"NN") len=length(lev) #print(lev) #x1=ifelse(x=="NN",NA,ifelse(x==lev[1],0,ifelse(x==lev[2],1,2))) #1 or less status if(len<=1)x1=0 # more than 3 status if(len> 3){ x1=0 warning("Existing none biallelic SNP!") } #2 status if(len==2|len==3)x1=ifelse(x=="NN",1,ifelse(substr(x,1,1)==substr(x,2,2),0,2)) return(matrix(x1,length(x),1)) }#end of GAPIT.Numericalization2.Dom function ############################################################################################## GAPIT.HapMap <- function(G,model="Add"){ #Object: To convert character SNP genotpe to numerical #Output: Coresponding numerical value #Authors: Feng Tian and Zhiwu Zhang # Last update: May 30, 2011 ############################################################################################## gc() GAPIT.Memory.Object(name.of.trait="HapMap.Start") #GT=data.frame(G[1,-(1:11)]) GT= t(G[1,-(1:11)]) GI= G[-1,c(1,3,4)] bit=nchar(as.character(G[2,12])) #to determine number of bits of genotype print(paste("Number of bits for genotype: ", bit)) if(bit==2){ if(model=="Add")GD= apply(G[-1,-(1:11)],1,GAPIT.Numericalization2.Add) if(model!="Add")GD= apply(G[-1,-(1:11)],1,GAPIT.Numericalization2.Dom) } if(bit==1){ if(model=="Add")GD= apply(G[-1,-(1:11)],1,GAPIT.Numericalization1.Add) if(model!="Add")GD= apply(G[-1,-(1:11)],1,GAPIT.Numericalization1.Dom) } colnames(GT)="taxa" colnames(GI)=c("SNP","Chromosome","Position") GAPIT.Memory.Object(name.of.trait="HapMap.Finished") return(list(GT=GT,GD=GD,GI=GI)) }#end of GAPIT.HapMap function ############################################################################################## GAPIT.CVMergePC<- function(X,Y){ #Object: To convert character SNP genotpe to numerical #Output: Coresponding numerical value #Authors: Feng Tian and Zhiwu Zhang # Last update: May 30, 2011 ############################################################################################## #Z=X+Y Z <- merge(X, Y, by.x = colnames(X)[1], by.y = colnames(Y)[1]) return(Z) }#end of GAPIT.CVMergePCfunction ############################################################################################## GAPIT.Memory <- function(Memory =NULL,Infor){ #Object: To report memory usage #Output: Memory #Authors: Zhiwu Zhang # Last update: June 6, 2011 ############################################################################################## gc() size <- memory.size() #print(paste("Memory usage: ",size," for", Infor)) if(is.null(Memory)) { Increased=0 Memory =cbind(Infor,size ,Increased) }else{ Increased=0 Memory.current=cbind(Infor,size ,Increased) Memory=rbind(Memory,Memory.current) Memory[nrow(Memory),3]=as.numeric(as.matrix(Memory[nrow(Memory),2]))-as.numeric(as.matrix(Memory[nrow(Memory)-1,2])) } return (Memory) }#end of GAPIT.Memory function ############################################################################################## GAPIT.Memory.Object <- function(name.of.trait="Trait"){ # Object: To report memoery usage # Authors: Heuristic Andrew # http://heuristically.wordpress.com/2010/01/04/r-memory-usage-statistics-variable/ # Modified by Zhiwu Zhang # Last update: may 29, 2011 ############################################################################################## # print aggregate memory usage statistics print(paste('R is using', memory.size(), 'MB out of limit', memory.limit(), 'MB')) # create function to return matrix of memory consumption object.sizes <- function() { return(rev(sort(sapply(ls(envir=.GlobalEnv), function (object.name) object.size(get(object.name)))))) } # export file in table format memory=object.sizes() file=paste("GAPIT.", name.of.trait,".Memory.Object.csv" ,sep = "") write.table(memory, file, quote = FALSE, sep = ",", row.names = TRUE,col.names = TRUE) # export file in PDF format pdf(paste("GAPIT.", name.of.trait,".Memory.Object.pdf" ,sep = "")) # draw bar plot barplot(object.sizes(), main="Memory usage by object", ylab="Bytes", xlab="Variable name", col=heat.colors(length(object.sizes()))) # draw dot chart dotchart(object.sizes(), main="Memory usage by object", xlab="Bytes") # draw pie chart pie(object.sizes(), main="Memory usage by object") dev.off() } ############################################################################################## GAPIT.Timmer <- function(Timmer=NULL,Infor){ #Object: To report current time #Output: Timmer #Authors: Zhiwu Zhang # Last update: may 8, 2011 ############################################################################################## Time<- Sys.time() if(is.null(Timmer)) { Elapsed=0 Timmer=cbind(Infor,Time,Elapsed) }else{ Elapsed=0 Timmer.current=cbind(Infor,Time,Elapsed) Timmer=rbind(Timmer,Timmer.current) Timmer[nrow(Timmer),3]=as.numeric(as.matrix(Timmer[nrow(Timmer),2]))-as.numeric(as.matrix(Timmer[nrow(Timmer)-1,2])) } #print(paste('Time used: ', Timmer[nrow(Timmer),3], ' seconds for ',Infor,sep="" )) return (Timmer) }#end of GAPIT.EMMAxP3D function ############################################################################################## GAPIT.Log <- function(Y=Y,KI=KI,Z=Z,CV=CV,emmaXp3d=emmaXp3d, groupFrom = groupFrom ,groupTo =groupTo ,groupBy = groupBy ,CA = CA, KT= KT, ngrid = ngrid , llin = llin , ulim = ulim , esp = esp ,name.of.trait = name.of.trait){ #Object: To report model factors #Output: Text file (GAPIT.Log.txt) #Authors: Zhiwu Zhang # Last update: may 16, 2011 ############################################################################################## #Creat storage facto <- list(NULL) value <- list(NULL) #collecting model factors facto[[1]]="Trait" value[[1]]=paste(dim(Y)) facto[[2]]="groupBy " value[[2]]=groupBy facto[[3]]="Trait name " value[[3]]=name.of.trait facto[[4]]="Kinship" value[[4]]=dim(KI) facto[[5]]="Z Matrix" value[[5]]=dim(Z) facto[[6]]="Covariate" value[[6]]=dim(CV) facto[[7]]="EMMAxP3D" value[[7]]=emmaXp3d facto[[8]]="Clustering algorithms" value[[8]]=CA facto[[9]]="Group kinship" value[[9]]=KT facto[[10]]="groupFrom " value[[10]]=groupFrom facto[[11]]="groupTo " value[[11]]=groupTo theLog=as.matrix(cbind(facto,value)) #theLog=as.character(as.matrix(cbind(facto,value))) colnames(theLog)=c("Model", "Value") file=paste("GAPIT.", name.of.trait,".Log.csv" ,sep = "") write.table(theLog, file, quote = FALSE, sep = ",", row.names = FALSE,col.names = TRUE) return (theLog) } ############################################################################################## GAPIT.Perform.BH.FDR.Multiple.Correction.Procedure <- function(PWI = PWI, FDR.Rate = 0.05, FDR.Procedure = "BH"){ #Object: Conduct the Benjamini-Hochberg FDR-Controlling Procedure #Output: PWIP, number.of.significant.SNPs #Authors: Alex Lipka and Zhiwu Zhang # Last update: May 5, 2011 ############################################################################################## #Make sure that your compouter has the latest version of Bioconductor (the "Biobase" package) and multtest if(is.null(PWI)) { PWIP=NULL number.of.significant.SNPs = 0 } if(!is.null(PWI)) { #library(multtest) if(dim(PWI)[1] == 1){ PWIP <- cbind(PWI, PWI[4]) colnames(PWIP)[5] <- "FDR_Adjusted_P-values" } if(dim(PWI)[1] > 1){ #mt.rawp2adjp Performs the Simes procedure. The output should be two columns, Left column: originial p-value #Right column: Simes corrected p-value res <- mt.rawp2adjp(PWI[,4], FDR.Procedure) #This command should order the p-values in the order of the SNPs in the data set adjp <- res$adjp[order(res$index), ] #round(adjp[1:7,],4) #Logical statment: 0, if Ho is not rejected; 1, if Ho is rejected, by the Simes corrected p-value temp <- mt.reject(adjp[,2], FDR.Rate) #Lists all number of SNPs that were rejected by the BY procedure #temp$r #Attach the FDR adjusted p-values to AS_Results PWIP <- cbind(PWI, adjp[,2]) #Sort these data by lowest to highest FDR adjusted p-value PWIP <- PWIP[order(PWIP[,4]),] colnames(PWIP)[7] <- "FDR_Adjusted_P-values" number.of.significant.SNPs = temp$r } #print("GAPIT.Perform.BH.FDR.Multiple.Correction.Procedure accomplished successfully!") } return(list(PWIP=PWIP, number.of.significant.SNPs = number.of.significant.SNPs)) } ############################################################################################## GAPIT.Pruning <- function(values,numSlots=20){ #Object: To get index of subset that evenly distribute #Output: Index #Authors: Zhiwu Zhang # Last update: May 28, 2011 ############################################################################################## #values= log.P.values values=sqrt(values) #This shift the weight a little bit to the low building. theMin=min(values) theMax=max(values) range=theMax-theMin interval=range/numSlots ladder=round(values/interval) ladder2=c(ladder[-1],0) keep=ladder-ladder2 index=which(keep>0) return(index) }#end of GAPIT.Pruning ############################################################################################## GAPIT.QQ <- function(P.values, plot.type = "log_P_values", name.of.trait = "Trait",numSlots=1000){ #Object: Make a QQ-Plot of the P-values #Options for plot.type = "log_P_values" and "P_values" #Output: A pdf of the QQ-plot #Authors: Alex Lipka and Zhiwu Zhang # Last update: May 9, 2011 ############################################################################################## # Sort the data by the raw P-values P.values <- P.values[order(P.values)] #Set up the p-value quantiles p_value_quantiles <- (1:length(P.values))/(length(P.values)+1) if(plot.type == "log_P_values") { log.P.values <- -log10(P.values) log.Quantiles <- -log10(p_value_quantiles) index=GAPIT.Pruning(log.P.values,numSlots=numSlots) log.P.values=log.P.values[index ] log.Quantiles=log.Quantiles[index] pdf(paste("GAPIT.", name.of.trait,".QQ-Plot.pdf" ,sep = "")) par(mar = c(5,5,5,5)) qqplot(log.Quantiles, log.P.values, xlim = c(0,max(log.Quantiles)), ylim = c(0,max(log.P.values)), cex.axis=1.5, cex.lab=2, lty = 1, lwd = 1, col = "Blue" ,xlab =expression(Expected~~-log[10](italic(p))), ylab = expression(Observed~~-log[10](italic(p))), main = paste(name.of.trait,sep=" ")) abline(a = 0, b = 1, col = "red") dev.off() } if(plot.type == "P_values") { pdf(paste("QQ-Plot_", name.of.trait,".pdf" ,sep = "")) par(mar = c(5,5,5,5)) qqplot(p_value_quantiles, P.values, xlim = c(0,1), ylim = c(0,1), type = "l" , xlab = "Uniform[0,1] Theoretical Quantiles", lty = 1, lwd = 1, ylab = "Quantiles of P-values from GWAS", col = "Blue", main = paste(name.of.trait,sep=" ")) abline(a = 0, b = 1, col = "red") dev.off() } #print("GAPIT.QQ accomplished successfully!") } ############################################################################################## GAPIT.Manhattan <- function(GI.MP = NULL, name.of.trait = "Trait", plot.type = "Genomewise",numSlots=1000){ #Object: Make a Manhattan Plot #Options for plot.type = "Separate_Graph_for_Each_Chromosome" and "Same_Graph_for_Each_Chromosome" #Output: A pdf of the Manhattan Plot #Authors: Alex Lipka, Zhiwu Zhang, and Meng Li # Last update: May 10, 2011 ############################################################################################## #do nothing if null input if(is.null(GI.MP)) return GI.MP=matrix(as.numeric(as.matrix(GI.MP) ) ,nrow(GI.MP),ncol(GI.MP)) #Remove all SNPs that do not have a choromosome and bp position GI.MP <- GI.MP[!is.na(GI.MP[,1]),] GI.MP <- GI.MP[!is.na(GI.MP[,2]),] #Remove all SNPs that have P values above 0 GI.MP <- GI.MP[GI.MP[,3]>0,] #Replace P the -log10 of the P-values GI.MP[,3] <- -log10(GI.MP[,3]) y.lim <- ceiling(max(GI.MP[,3])) chm.to.analyze <- unique(GI.MP[,1]) chm.to.analyze=chm.to.analyze[order(chm.to.analyze)] numCHR= length(chm.to.analyze) #Chromosomewise plot if(plot.type == "Chromosomewise") { pdf(paste("GAPIT.", name.of.trait,".Manhattan-Plot.Chromosomewise.pdf" ,sep = ""), width = 10) par(mar = c(5,5,4,3), lab = c(8,5,7)) for(i in 1:numCHR) { subset=GI.MP[GI.MP[,1]==chm.to.analyze[i],] x0 <- as.numeric(subset[,2])/10^(8) y0 <- as.numeric(subset[,3]) order=order(y0,decreasing = TRUE) index0=GAPIT.Pruning(y0[order],numSlots=20000) index=order(index0) x=x0[index] y=y0[index] #color.vector <- subset(temp.par.data[,7], temp.par.data[,4] == i) plot(y~x,type="p", ylim=c(0,y.lim), xlim = c(min(x), max(x)), col = "navy", xlab = expression(Base~Pairs~(x10^-8)), ylab = "-Log Base 10 p-value", main = paste("Chm",chm.to.analyze[i],sep=" ")) } dev.off() } #Chromosomewise plot #Genomewise plot if(plot.type == "Genomewise") { GI.MP <- GI.MP[order(GI.MP[,2]),] GI.MP <- GI.MP[order(GI.MP[,1]),] color.vector <- rep(c("orangered","navyblue"),numCHR) ticks=NULL lastbase=0 #change base position to accumulatives for (i in chm.to.analyze) { index=(GI.MP[,1]==i) ticks <- c(ticks, lastbase+mean(GI.MP[index,2])) GI.MP[index,2]=GI.MP[index,2]+lastbase lastbase=max(GI.MP[index,2]) } x0 <- as.numeric(GI.MP[,2]) y0 <- as.numeric(GI.MP[,3]) z0 <- as.numeric(GI.MP[,1]) position=order(y0,decreasing = TRUE) index0=GAPIT.Pruning(y0[position],numSlots=numSlots) index=position[index0] x=x0[index] y=y0[index] z=z0[index] pdf(paste("GAPIT.", name.of.trait,".Manhattan-Plot.Genomewise.pdf" ,sep = ""), width = 10) par(mar = c(5,5,5,1)) plot(y~x,xlab=expression(Chromosome),ylab=expression(-log[10](italic(p))) , cex.lab=2,col=ifelse(z%%2==0,"orangered","navy"),axes=FALSE,type = "p",pch=20,main = paste(name.of.trait,sep=" ")) axis(1, at=ticks,cex.axis=1.5,labels=chm.to.analyze,tick=F) axis(2, at=1:y.lim,cex.axis=1.5,labels=1:y.lim,tick=F) box() dev.off() } #Genomewise plot #print("GAPIT.Manhattan accomplished successfully!") } #end of GAPIT.Manhattan ############################################################################################## GAPIT.Filter.SNPs.with.Low.MAFs <- function(GI = GI, P.values = P.values, maf = maf, nobs = nobs, mafRate = 0.00){ #Object: Filter out SNPs with MAFs less than a user-defined threshold. #Output: PWI.Filtered #Authors: Alex Lipka, Zhiwu Zhang # Last update: May 6, 2011 ############################################################################################## if(is.null(GI)) PWI.Filtered=NULL #Merge if(!is.null(GI)) { PWI.with.MAFs <- cbind(GI,P.values, maf, nobs) colnames(PWI.with.MAFs [c(5,6)]) <- c("MAF", "N") #Filter out SNPs with MAF less than the threshold PWI.Filtered <- PWI.with.MAFs[which(PWI.with.MAFs[,5] >= mafRate), ] } #print("GAPIT.Filter.SNPs.with.Low.MAFs accomplished successfully!") return(PWI.Filtered) } ############################################################################################## GAPIT.Compression.Visualization <- function(Compression = Compression, name.of.trait = name.of.trait){ #Object: Conduct the Benjamini-Hochberg FDR-Controlling Procedure #Output: Three pdfs: One of the log likelihood function, one of the genetic and error variance component, # and one of the heritabilities #Authors: Alex Lipka and Zhiwu Zhang # Last update: May 10, 2011 ############################################################################################## #Graph the optimum compression if(length(Compression)<=6) Compression=t(as.matrix(Compression[which(Compression[,4]!="NULL"&Compression[,4]!="NaN"),])) if(length(Compression)>6) Compression=Compression[which(Compression[,4]!="NULL"&Compression[,4]!="NaN"),] LL=as.numeric(Compression[,4]) Compression.best=Compression[which(LL==min(LL)),] if(length(Compression.best)>6) Compression.best=Compression.best[1,] #Keep the first if multiple combinations variance=as.numeric(Compression.best[5:6]) colors <- c("grey50","grey70") labels0 <- round(variance/sum(variance) * 100, 1) labels <- paste(labels0, "%", sep="") LL.best0=as.numeric(Compression.best[4] ) LL.best=floor(LL.best0*100)/100 theOptimum=paste(c(Compression.best[c(1:3)],LL.best) ) pdf(paste("GAPIT.", name.of.trait,".Optimum.pdf", sep = ""), width = 14) par(mfrow = c(1,1), mar = c(1,1,5,5), lab = c(5,5,7)) pie(variance, col=colors, labels=labels,angle=45) legend(1.0, 0.5, c("Genetic varaince","Residual varaiance"), cex=1.5, fill=colors) #Display the optimum compression text(1.5,.0, "The optimum compression", col= "red") for(i in 1:4){ text(1.5,-.1*i, theOptimum[i], col= "red") } dev.off() #Graph compression with multiple groups if(length(unique(Compression[,3]))>1) { #Create a vector of colors color.vector.basic <- c("red","blue","black", "blueviolet","indianred","cadetblue","orange") color.vector.addition <- setdiff(c(colors()[grep("red",colors())], colors()[grep("blue",colors())]),color.vector.basic ) color.vector.addition.mixed <- sample(color.vector.addition,max(0,((length(unique(Compression[,1])) * length(unique(Compression[,2])))-length(color.vector.basic)))) color.vector <- c(color.vector.basic,color.vector.addition.mixed ) #Create a vector of numbers for the line dot types line.vector <- rep(1:(length(unique(Compression[,1])) * length(unique(Compression[,2])))) #We want to have a total of three plots, one displaying the likelihood function, one displaying the variance components, and one displaying the # heritability pdf(paste("GAPIT.", name.of.trait,".Compression.multiple.group.", ".pdf", sep = ""), width = 14) par(mfrow = c(2,3), mar = c(5,5,1,1), lab = c(5,5,7)) # Make the likelihood function plot k <- 1 for(i in 1:length(unique(Compression[,1]))){ for(j in 1:length(unique(Compression[,2]))){ if((i == 1)&(j == 1)) { Compression.subset <- Compression[which( (Compression[,1] == as.character(unique(Compression[,1])[i])) & (Compression[,2] == as.character(unique(Compression[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,4]) plot(y~x,type="l", pch = 30, lty = line.vector[i], ylim=c(min(as.numeric(Compression[,4])),max(as.numeric(Compression[,4]))), xlim = c(min(as.numeric(Compression[,3])),max(as.numeric(Compression[,3]))), col = color.vector[j], xlab = "Number of Groups", ylab = "-2Log Likelihoood", ) label = paste(c(as.character(unique(Compression[,1]))[k]," ",as.character(unique(Compression[,2]))[j]), collapse = "") } if((i != 1)|(j != 1)) { k <- k+1 Compression.subset <- Compression[which( (Compression[,1] == as.character(unique(Compression[,1])[i])) & (Compression[,2] == as.character(unique(Compression[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,4]) lines(y~x,type="l", pch = 30, lty = line.vector[i], col = color.vector[j]) label = c(label, paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "")) } } } #Make a legend #legend("topright", label, fill = color.vector) # Make the genetic variance component plots k <- 1 for(i in 1:length(unique(Compression[,1]))){ for(j in 1:length(unique(Compression[,2]))){ if((i == 1)&(j == 1)) { Compression.subset <- Compression[which( (Compression[,1] == as.character(unique(Compression[,1])[i])) & (Compression[,2] == as.character(unique(Compression[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,5]) plot(y~x,type="l", pch = 17, lty = line.vector[i], ylim=c(min(as.numeric(Compression[,5])),max(as.numeric(Compression[,5]))), xlim = c(min(as.numeric(Compression[,3])),max(as.numeric(Compression[,3]))), col = color.vector[j], xlab = "Number of Groups", ylab = "Genetic Variance", ) #label = paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "") } if((i != 1)|(j != 1)) { k <- k+1 Compression.subset <- Compression[which( (Compression[,1] == as.character(unique(Compression[,1])[i])) & (Compression[,2] == as.character(unique(Compression[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,5]) lines(y~x,type="l", pch = 17, lty = line.vector[i], col = color.vector[j]) #label = c(label, paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "")) } } } #Make a legend #legend("topleft", label, fill = color.vector) # Make the residual variance component plots k <- 1 for(i in 1:length(unique(Compression[,1]))){ for(j in 1:length(unique(Compression[,2]))){ if((i == 1)&(j == 1)) { Compression.subset <- Compression[which( (Compression[,1] == as.character(unique(Compression[,1])[i])) & (Compression[,2] == as.character(unique(Compression[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,6]) plot(y~x,type="l", pch = 17, ylim=c(min(as.numeric(Compression[,6])),max(as.numeric(Compression[,6]))), xlim = c(min(as.numeric(Compression[,3])),max(as.numeric(Compression[,3]))), col = color.vector[j], xlab = "Number of Groups", ylab = "Residual Variance", ) #label = paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "") } if((i != 1)|(j != 1)) { k <- k+1 Compression.subset <- Compression[which( (Compression[,1] == as.character(unique(Compression[,1])[i])) & (Compression[,2] == as.character(unique(Compression[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,6]) lines(y~x,type="l", pch = 17, lty = line.vector[i], col = color.vector[j]) #label = c(label, paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "")) } } } #Make a legend #legend("topright", label, fill = color.vector) #calculate total variance and h2 heritablilty.vector <- as.numeric(Compression[,5])/(as.numeric(Compression[,5]) + as.numeric(Compression[,6])) totalVariance.vector <- as.numeric(as.numeric(Compression[,5]) + as.numeric(Compression[,6])) Compression.h2 <- cbind(Compression, heritablilty.vector,totalVariance.vector) # Make the total variance component plots k <- 1 for(i in 1:length(unique(Compression.h2[,1]))){ for(j in 1:length(unique(Compression.h2[,2]))){ if((i == 1)&(j == 1)) { Compression.subset <- Compression.h2[which( (Compression.h2[,1] == as.character(unique(Compression.h2[,1])[i])) & (Compression.h2[,2] == as.character(unique(Compression.h2[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,8]) plot(y~x,type="l", pch = 17, lty = line.vector[k], ylim=c(min(as.numeric(Compression.h2[,8])),max(as.numeric(Compression.h2[,8]))), xlim = c(min(as.numeric(Compression.h2[,3])),max(as.numeric(Compression.h2[,3]))), col = color.vector[1], xlab = "Number of Groups", ylab = "Total Variance", ) #label = paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "") } if((i != 1)|(j != 1)) { k <- k+1 Compression.subset <- Compression.h2[which( (Compression.h2[,1] == as.character(unique(Compression.h2[,1])[i])) & (Compression.h2[,2] == as.character(unique(Compression.h2[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,8]) lines(y~x,type="l", pch = 17, lty = line.vector[i], col = color.vector[j]) #label = c(label, paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "")) } } } #Make a legend #legend("topright", label, fill = color.vector) # Make the heritability plots k <- 1 for(i in 1:length(unique(Compression[,1]))){ for(j in 1:length(unique(Compression[,2]))){ if((i == 1)&(j == 1)) { Compression.subset <- Compression.h2[which( (Compression.h2[,1] == as.character(unique(Compression.h2[,1])[i])) & (Compression.h2[,2] == as.character(unique(Compression.h2[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,7]) plot(y~x,type="l", pch = 17, lty = line.vector[k], ylim=c(min(as.numeric(Compression.h2[,7])),max(as.numeric(Compression.h2[,7]))), xlim = c(min(as.numeric(Compression.h2[,3])),max(as.numeric(Compression.h2[,3]))), col = color.vector[1], xlab = "Number of Groups", ylab = "Heritability", ) #label = paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "") } if((i != 1)|(j != 1)) { k <- k+1 Compression.subset <- Compression.h2[which( (Compression.h2[,1] == as.character(unique(Compression.h2[,1])[i])) & (Compression.h2[,2] == as.character(unique(Compression.h2[,2])[j])) ), ] x <- as.numeric(Compression.subset[,3]) y <- as.numeric(Compression.subset[,7]) lines(y~x,type="l", lty = line.vector[i], pch = 17, col = color.vector[j]) #label = c(label, paste(c(as.character(unique(Compression[,1]))[i]," ",as.character(unique(Compression[,2]))[j]), collapse = "")) } } } #Make a legend #legend("topleft", label, fill = color.vector) legend.col= 1+floor(length(unique(Compression[,1])) * length(unique(Compression[,2]))/20) line.style=rep(1:length(unique(Compression[,1])), each = length(unique(Compression[,2]))) line.color=rep(1:length(unique(Compression[,2])), length(unique(Compression[,1]))) # Make labels plot(0~0,axes=FALSE,type="l",ylab = "",xlab = "",frame.plot=FALSE) legend("topleft", label, col = color.vector[line.color], lty = line.style, ncol=legend.col,horiz=FALSE) dev.off() }#end of Graph compression with multiple groups #Graph compression with single groups if(length(unique(Compression[,3]))==1& length(unique(Compression[,1]))*length(unique(Compression[,2]))>1) { #Graph the compression with only one group pdf(paste("GAPIT.Compression.single.group.", name.of.trait, ".pdf", sep = ""), width = 14) par(mfrow = c(2,2), mar = c(5,5,1,1), lab = c(5,5,7)) nkt=length(unique(Compression[,1])) nca=length(unique(Compression[,2])) kvr=rep(c(1:nkt),nca) kvc0=rep(c(1:nca),nkt) kvc=as.numeric(t(matrix(kvc0,nca,nkt))) kt.name=Compression[1:nkt,1] ca.index=((1:nca)-1)*nkt+1 ca.name=Compression[ca.index,2] KG<- t(tapply(as.numeric(Compression[,4]), list(kvr, kvc), mean)) colnames(KG)=kt.name barplot(as.matrix(KG), ylab= "-2 Log Likelihood",beside=TRUE, col=rainbow(length(unique(Compression[,2])))) KG<- t(tapply(as.numeric(Compression[,5]), list(kvr, kvc), mean)) colnames(KG)=kt.name barplot(as.matrix(KG), ylab= "Genetic varaince", beside=TRUE, col=rainbow(length(unique(Compression[,2])))) KG<- t(tapply(as.numeric(Compression[,6]), list(kvr, kvc), mean)) colnames(KG)=kt.name barplot(as.matrix(KG), ylab= "Residual varaince", beside=TRUE, col=rainbow(length(unique(Compression[,2])))) KG<- t(tapply(as.numeric(Compression[,5])/(as.numeric(Compression[,5])+as.numeric(Compression[,6])), list(kvr, kvc), mean)) colnames(KG)=kt.name barplot(as.matrix(KG), ylab= "Heritability", beside=TRUE, col=rainbow(length(unique(Compression[,2]))),ylim=c(0,1)) legend("topleft", paste(t(ca.name)), cex=0.8,bty="n", fill=rainbow(length(unique(Compression[,2]))),horiz=TRUE) dev.off() } #end of Graph compression with single groups #print("GAPIT.Compression.Visualization accomplished successfully!") }#GAPIT.Compression.Plots ends here ############################################################################################## GAPIT.Table <- function(final.table = final.table, name.of.trait = name.of.trait,FDR.Filter.Rate=1){ #Object: Make and export a table of summary information from GWAS #Output: A table summarizing GWAS results #Authors: Alex Lipka and Zhiwu Zhang # Last update: May 10, 2011 ############################################################################################## #Filter SNPs by FDR index=(final.table[,7]<=FDR.Filter.Rate) final.table=final.table[index,] #Export this summary table as an excel file write.table(final.table, paste("GAPIT.", name.of.trait, ".GWAS.Results.csv", sep = ""), quote = FALSE, sep = ",", row.names = FALSE,col.names = TRUE) #print("GAPIT.Table accomplished successfully!") } #GAPIT.Table ends here ############################################################################################## GAPIT.GS.Visualization <- function(gsBLUP = gsBLUP, BINS=BINS, name.of.trait = name.of.trait){ #Object: To build heat map to show distribution of BLUP and PEV #Output: pdf #Authors: Zhiwu Zhang # Last update: May 15, 2011 ############################################################################################## pdf(paste("GAPIT.", name.of.trait,".GS", ".pdf", sep = ""), width = 14) par(mfrow = c(1,1), mar = c(1,1,5,5), lab = c(5,5,7)) nBin=BINS BLUP= gsBLUP[,5] PEV = gsBLUP[,6] if(BLUP[1]=="NaN"){ warning ("It was not converged. BLUP was not created!") } if(BLUP[1]!="NaN") { range.BLUP=max(BLUP)-min(BLUP) range.PEV=max(PEV)-min(PEV) interval.BLUP=range.BLUP/nBin interval.PEV=range.PEV/nBin bin.BLUP=floor(BLUP/max(BLUP)*nBin)*max(BLUP)/nBin bin.PEV=floor(PEV/max(PEV)*nBin)*max(PEV)/nBin distinct.BLUP=unique(bin.BLUP) distinct.PEV=unique(bin.PEV) Position.BLUP=match(bin.BLUP,distinct.BLUP,nomatch = 0) Position.PEV=match(bin.PEV,distinct.PEV,nomatch = 0) value=matrix(1,length(Position.BLUP)) KG<- (tapply(as.numeric(value), list(Position.BLUP, Position.PEV), sum)) rownames(KG)=distinct.BLUP colnames(KG)=distinct.PEV length(unique(bin.BLUP)) length(unique(bin.PEV)) nba_heatmap <- heatmap(KG, Rowv=NA, Colv=NA, col = cm.colors(256), scale="column", margins=c(5,10)) dev.off() } #print("GAPIT.GS.Visualization accomplished successfully!") } #GAPIT.GS.Visualization ends here ############################################################################################## GAPIT.kinship.loiselle <- function(snps, method="additive", use="all") { # Object: To calculate the kinship matrix using the method of Loiselle et al. (1995) # Authors: Alex Lipka and Hyun Min Kang # Last update: May 31, 2011 ############################################################################################## #Number of SNP types that are 0s n0 <- sum(snps==0,na.rm=TRUE) #Number of heterozygote SNP types nh <- sum(snps==1,na.rm=TRUE) #Number of SNP types that are 1s n1 <- sum(snps==2,na.rm=TRUE) #Number of SNP types that are missing nNA <- sum(is.na(snps)) #Self explanatory dim(snps)[1]*dim(snps)[2] #stopifnot(n0+nh+n1+nNA == length(snps)) #Note that the two lines in if(method == "dominant") and if(method == "recessive") are found in #if(method == "additive"). Worry about this only if you have heterozygotes, which you do not. if ( method == "dominant" ) { flags <- matrix(as.double(colMeans(snps,na.rm=TRUE) > 1),ncol(snps),nrow(snps)) snps[!is.na(snps) && (snps == 1)] <- flags[!is.na(snps) && (snps == 1)] } else if ( method == "recessive" ) { flags <- matrix(as.double(colMeans(snps,na.rm=TRUE) < 1),ncol(snps),nrow(snps)) snps[!is.na(snps) && (snps == 1)] <- flags[!is.na(snps) && (snps == 1)] } else if ( ( method == "additive" ) && ( nh > 0 ) ) { dsnps <- snps rsnps <- snps flags <- matrix(as.double(colMeans(snps,na.rm=TRUE) > 1),ncol(snps),nrow(snps)) dsnps[!is.na(snps) && (snps==1)] <- flags[is.na(snps) && (snps==1)] flags <- matrix(as.double(colMeans(snps,na.rm=TRUE) < 1),ncol(snps),nrow(snps)) rsnps[!is.na(snps) && (snps==1)] <- flags[is.na(snps) && (snps==1)] snps <- cbind(dsnps,rsnps) } #mafs is a (# lines)x(# SNPs)x matrix. The rows mafs are identical, and the ij^th element is the average #allele frequency for the SNP in the j^th column. #if(use == "all") imputes missing SNP type values with the expected (average) allele frequency. if ( use == "all" ) { mafs <- matrix(colMeans(snps,na.rm=TRUE),ncol(snps),nrow(snps)) snps[is.na(snps)] <- mafs[is.na(snps)] } else if ( use == "complete.obs" ) { mafs <- matrix(colMeans(snps,na.rm=TRUE),ncol(snps),nrow(snps)) snps <- snps[colSums(is.na(snps))==0,] } mafs_comp <- 1-mafs snps_comp <- 1-snps n <- nrow(snps) K <- matrix(nrow=n,ncol=n) diag(K) <- 1 #Create the k term on page 1422 of Loiselle et al. (1995) missing <- rep(NA, dim(snps)[2]) for(i in 1:dim(snps)[2]) { missing[i] <- sum(is.na(snps[,i])) } for(i in 1:(n-1)) { for(j in (i+1):n) { Num_First_Term_1 <- (snps[i,]-mafs[i,])*(snps[j,]-mafs[j,]) Num_First_Term_2 <- (snps_comp[i,]-mafs_comp[i,])*(snps_comp[j,]-mafs_comp[j,]) First_Term <- sum(Num_First_Term_1)+sum(Num_First_Term_2) Num_Second_Term_1 <- mafs[,i]*(1-mafs[,i]) Num_Second_Term_2 <- mafs_comp[,i]*(1-mafs_comp[,i]) Num_Second_Term_Bias_Correction <- 1/((2*n)-missing - 1) Num_Second_Term <- Num_Second_Term_1 + Num_Second_Term_2 Second_Term <- sum(Num_Second_Term*Num_Second_Term_Bias_Correction) Third_Term <- sum(Num_Second_Term) f <- (First_Term + Second_Term)/Third_Term if(f <= 0){ K[i,j] <- 0 } else{ K[i,j] <- f } K[j,i] <- K[i,j] } } return(K) } ############################################################################################## GAPIT.PCA <- function(X,taxa, PC.number = min(ncol(X),nrow(X))){ # Object: Conduct a principal component analysis, and output the prinicpal components into the workspace, # a text file of the principal components, and a pdf of the scree plot # Authors: Alex Lipka and Hyun Min Kang # Last update: May 31, 2011 ############################################################################################## #Conduct the PCA PCA.X <- prcomp(X) #Create a Scree plot pdf("GAPIT.Scree.Plot.pdf", width = 12, height = 12) screeplot(PCA.X, type="lines") dev.off() #Write the PCs into a text file PCs <- cbind(taxa,as.data.frame(PCA.X$x[,1:PC.number])) #Remove duplicate PCs.unique <- unique(PCs[,1]) PCs <-PCs[match(PCs.unique, PCs[,1], nomatch = 0), ] write.table(PCs, "GAPIT.Principal.Components.csv", quote = FALSE, sep = ",", row.names = TRUE,col.names = TRUE) #Return the PCs return(PCs) } ############################################################################################## GAPIT.kinship.loiselle <- function(snps, method="additive", use="all") { # Object: To calculate the kinship matrix using the method of Loiselle et al. (1995) # Authors: Alex Lipka and Hyun Min Kang # Last update: May 31, 2011 ############################################################################################## #Number of SNP types that are 0s n0 <- sum(snps==0,na.rm=TRUE) #Number of heterozygote SNP types nh <- sum(snps==0.5,na.rm=TRUE) #Number of SNP types that are 1s n1 <- sum(snps==1,na.rm=TRUE) #Number of SNP types that are missing nNA <- sum(is.na(snps)) #Self explanatory dim(snps)[1]*dim(snps)[2] #stopifnot(n0+nh+n1+nNA == length(snps)) #Note that the two lines in if(method == "dominant") and if(method == "recessive") are found in #if(method == "additive"). Worry about this only if you have heterozygotes, which you do not. if ( method == "dominant" ) { flags <- matrix(as.double(rowMeans(snps,na.rm=TRUE) > 0.5),nrow(snps),ncol(snps)) snps[!is.na(snps) && (snps == 0.5)] <- flags[!is.na(snps) && (snps == 0.5)] } else if ( method == "recessive" ) { flags <- matrix(as.double(rowMeans(snps,na.rm=TRUE) < 0.5),nrow(snps),ncol(snps)) snps[!is.na(snps) && (snps == 0.5)] <- flags[!is.na(snps) && (snps == 0.5)] } else if ( ( method == "additive" ) && ( nh > 0 ) ) { dsnps <- snps rsnps <- snps flags <- matrix(as.double(rowMeans(snps,na.rm=TRUE) > 0.5),nrow(snps),ncol(snps)) dsnps[!is.na(snps) && (snps==0.5)] <- flags[is.na(snps) && (snps==0.5)] flags <- matrix(as.double(rowMeans(snps,na.rm=TRUE) < 0.5),nrow(snps),ncol(snps)) rsnps[!is.na(snps) && (snps==0.5)] <- flags[is.na(snps) && (snps==0.5)] snps <- rbind(dsnps,rsnps) } #mafs is a (# SNPs)x(# lines) matrix. The columns of mafs are identical, and the ij^th element is the average #allele frequency for the SNP in the i^th row. #if(use == "all") imputes missing SNP type values with the expected (average) allele frequency. if ( use == "all" ) { mafs <- matrix(rowMeans(snps,na.rm=TRUE),nrow(snps),ncol(snps)) snps[is.na(snps)] <- mafs[is.na(snps)] } else if ( use == "complete.obs" ) { mafs <- matrix(rowMeans(snps,na.rm=TRUE),nrow(snps),ncol(snps)) snps <- snps[rowSums(is.na(snps))==0,] } mafs_comp <- 1-mafs snps_comp <- 1-snps n <- ncol(snps) K <- matrix(nrow=n,ncol=n) diag(K) <- 1 #Create the k term on page 1422 of Loiselle et al. (1995) missing <- rep(NA, dim(snps)[1]) for(i in 1:dim(snps)[1]) { missing[i] <- sum(is.na(snps[i,])) } for(i in 1:(n-1)) { for(j in (i+1):n) { Num_First_Term_1 <- (snps[,i]-mafs[,i])*(snps[,j]-mafs[,j]) Num_First_Term_2 <- (snps_comp[,i]-mafs_comp[,i])*(snps_comp[,j]-mafs_comp[,j]) First_Term <- sum(Num_First_Term_1)+sum(Num_First_Term_2) Num_Second_Term_1 <- mafs[,i]*(1-mafs[,i]) Num_Second_Term_2 <- mafs_comp[,i]*(1-mafs_comp[,i]) Num_Second_Term_Bias_Correction <- 1/((2*n)-missing - 1) Num_Second_Term <- Num_Second_Term_1 + Num_Second_Term_2 Second_Term <- sum(Num_Second_Term*Num_Second_Term_Bias_Correction) Third_Term <- sum(Num_Second_Term) f <- (First_Term + Second_Term)/Third_Term K[i,j] <- f if(K[i,j]<0) K[i,j]=0 K[j,i] <- K[i,j] } } return(K) } ############################################################################################## GAPIT.replaceNaN <- function(LL) { #handler of grids with NaN log #Authors: Zhiwu Zhang # Last update: may 12, 2011 ############################################################################################## #handler of grids with NaN log index=(LL=="NaN") if(length(index)>0) theMin=min(LL[!index]) if(length(index)<1) theMin="NaN" LL[index]=theMin return(LL) } ################################################################################################################ emma.REMLE <- function(y, X, K, Z=NULL, ngrids=100, llim=-10, ulim=10, esp=1e-10, eig.L = NULL, eig.R = NULL) { # Authors: Hyun Min Kang # Modified (only one line) by Zhiwu Zhang to handle non-defined LL ("NaN") by replacing it with the worst LL. # Last update: June 8, 2011 ################################################################################################################ n <- length(y) t <- nrow(K) q <- ncol(X) # stopifnot(nrow(K) == t) stopifnot(ncol(K) == t) stopifnot(nrow(X) == n) if ( det(crossprod(X,X)) == 0 ) { warning("X is singular") return (list(REML=0,delta=0,ve=0,vg=0)) } if ( is.null(Z) ) { if ( is.null(eig.R) ) { eig.R <- emma.eigen.R.wo.Z(K,X) } etas <- crossprod(eig.R$vectors,y) logdelta <- (0:ngrids)/ngrids*(ulim-llim)+llim m <- length(logdelta) delta <- exp(logdelta) Lambdas <- matrix(eig.R$values,n-q,m) + matrix(delta,n-q,m,byrow=TRUE) Etasq <- matrix(etas*etas,n-q,m) LL <- 0.5*((n-q)*(log((n-q)/(2*pi))-1-log(colSums(Etasq/Lambdas)))-colSums(log(Lambdas))) dLL <- 0.5*delta*((n-q)*colSums(Etasq/(Lambdas*Lambdas))/colSums(Etasq/Lambdas)-colSums(1/Lambdas)) optlogdelta <- vector(length=0) optLL <- vector(length=0) if ( dLL[1] < esp ) { optlogdelta <- append(optlogdelta, llim) optLL <- append(optLL, emma.delta.REML.LL.wo.Z(llim,eig.R$values,etas)) } if ( dLL[m-1] > 0-esp ) { optlogdelta <- append(optlogdelta, ulim) optLL <- append(optLL, emma.delta.REML.LL.wo.Z(ulim,eig.R$values,etas)) } for( i in 1:(m-1) ) { if ( ( dLL[i]*dLL[i+1] < 0 ) && ( dLL[i] > 0 ) && ( dLL[i+1] < 0 ) ) { r <- uniroot(emma.delta.REML.dLL.wo.Z, lower=logdelta[i], upper=logdelta[i+1], lambda=eig.R$values, etas=etas) optlogdelta <- append(optlogdelta, r$root) optLL <- append(optLL, emma.delta.REML.LL.wo.Z(r$root,eig.R$values, etas)) } } # optdelta <- exp(optlogdelta) } else { if ( is.null(eig.R) ) { eig.R <- emma.eigen.R.w.Z(Z,K,X) } etas <- crossprod(eig.R$vectors,y) etas.1 <- etas[1:(t-q)] etas.2 <- etas[(t-q+1):(n-q)] etas.2.sq <- sum(etas.2*etas.2) logdelta <- (0:ngrids)/ngrids*(ulim-llim)+llim m <- length(logdelta) delta <- exp(logdelta) Lambdas <- matrix(eig.R$values,t-q,m) + matrix(delta,t-q,m,byrow=TRUE) Etasq <- matrix(etas.1*etas.1,t-q,m) dLL <- 0.5*delta*((n-q)*(colSums(Etasq/(Lambdas*Lambdas))+etas.2.sq/(delta*delta))/(colSums(Etasq/Lambdas)+etas.2.sq/delta)-(colSums(1/Lambdas)+(n-t)/delta)) optlogdelta <- vector(length=0) optLL <- vector(length=0) if ( dLL[1] < esp ) { optlogdelta <- append(optlogdelta, llim) optLL <- append(optLL, emma.delta.REML.LL.w.Z(llim,eig.R$values,etas.1,n,t,etas.2.sq)) } if ( dLL[m-1] > 0-esp ) { optlogdelta <- append(optlogdelta, ulim) optLL <- append(optLL, emma.delta.REML.LL.w.Z(ulim,eig.R$values,etas.1,n,t,etas.2.sq)) } for( i in 1:(m-1) ) { if ( ( dLL[i]*dLL[i+1] < 0 ) && ( dLL[i] > 0 ) && ( dLL[i+1] < 0 ) ) { r <- uniroot(emma.delta.REML.dLL.w.Z, lower=logdelta[i], upper=logdelta[i+1], lambda=eig.R$values, etas.1=etas.1, n=n, t1=t, etas.2.sq = etas.2.sq ) optlogdelta <- append(optlogdelta, r$root) optLL <- append(optLL, emma.delta.REML.LL.w.Z(r$root,eig.R$values, etas.1, n, t, etas.2.sq )) } } # optdelta <- exp(optlogdelta) } maxdelta <- exp(optlogdelta[which.max(optLL)]) #handler of grids with NaN log optLL=GAPIT.replaceNaN(optLL) maxLL <- max(optLL) if ( is.null(Z) ) { maxva <- sum(etas*etas/(eig.R$values+maxdelta))/(n-q) } else { maxva <- (sum(etas.1*etas.1/(eig.R$values+maxdelta))+etas.2.sq/maxdelta)/(n-q) } maxve <- maxva*maxdelta return (list(REML=maxLL,delta=maxdelta,ve=maxve,vg=maxva)) } ############################################################################################## GAPIT.Sampler <- function(GD=NULL,dataPath=NULL,numFiles=1,GFile=NULL,GFileExt=NULL,seed=0,ratio=1,model="Add", genoFormat=NULL, GDFile=NULL, GDFileExt=NULL){ #Object: To sample genotype from multiple genotype files #Output: genotype data sampled #Authors: Alex Lipka and Zhiwu Zhang # Last update: June 11, 2011 ############################################################################################## #Take a random sample of SNPs from the first file. Output is "GD.Ssample" set.seed(seed) n= ncol(GD) sample=sample(1:n,floor(n*ratio)) GD.Sample=GD[,sample] rm(GD) gc() #Take a random sample of SNPs from the remaining files if (numFiles>1) { if(genoFormat=="Hapmap"){ for (file in 2:numFiles) { G <- read.table(paste(dataPath,GFile,file, ".",GFileExt,sep=""), head = FALSE) hm=GAPIT.HapMap(G,model=model) rm(G) gc() n= ncol(hm$GD) set.seed(seed+file) sample=sample(1:n,floor(n*ratio)) GD.Sample=cbind(GD.Sample,hm$GD[sample]) rm(hm) gc() } # end of multiple genotype files } #end of numfile>1 if(genoFormat=="EMMA"){ for (file in 2:numFiles) { GD <- read.table(paste(dataPath,GDFile,file, ".",GDFileExt,sep=""), head = TRUE) GD=GD[,-1] n= ncol(GD) set.seed(seed+file) sample=sample(1:n,floor(n*ratio)) GD.Sample=cbind(GD.Sample,GD[sample]) rm(GD) gc() } # end of multiple genotype files } #end of numfile>1 } return(GD.Sample) } ############################################################################################## GAPIT.Genotype <- function(G=NULL,GD=NULL,GM=NULL,KI=NULL,kinMethod="VanRaden",numPCs=0, dataPath=NULL,numFiles=0,GFile=NULL, GFileExt=NULL,GDFile=NULL, GMFile=NULL, GDFileExt=NULL,GMFileExt=NULL, mafRate=0.05,FDR.Rate = 0.05,FDR.Filter.Rate=1,model="Add",Timmer=NULL,Memory=NULL){ #Object: To unify genotype and calculate kinship and PC if required #Output: GD, GI, GT, KI and PC #Authors: Zhiwu Zhang #Last update: August 11, 2011 ############################################################################################## seed=123 ratio=1 if(is.null(dataPath) & (is.null(G) & is.null(GD)) & (!is.null(GFile) | !is.null(GDFile))) stop("A path for genotype data should be provided!") if(is.null(dataPath) & numFiles>1) stop(paste("A path for genotype data should be provided for the ",numFiles, " files specified by numFiles!")) flag.single.vs.multiple=FALSE if(!is.null(G)&numFiles>1) flag.single.vs.multiple=TRUE if(!is.null(GD)&numFiles>1) flag.single.vs.multiple=TRUE if(flag.single.vs.multiple) stop("Please input genotype data in either a singel file or use multiple file option!") #Multiple genotype files if(is.null(numFiles))numFiles=1 if(numFiles==0)numFiles=1 hasGenotype=(!is.null(G) | !is.null(GD) | !is.null(dataPath) ) print("Debug: reading G for first file") if(is.null(G) & !is.null(GFile) ) G <- read.table(paste(dataPath,GFile, "1.",GFileExt,sep=""), head = FALSE) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Genotype loaded)") Memory=GAPIT.Memory(Memory=Memory,Infor="Genotype loaded") if(!is.null(G)) genoFormat="Hapmap" if(is.null(G)) genoFormat="EMMA" #Rename GM as GI if(!is.null(GM))GI=GM if(is.null(GD) & !is.null(GDFile) & !is.null(GMFile)) { GI <- read.table(paste(dataPath,GMFile, "1.",GMFileExt,sep=""), head = TRUE) GD <- read.table(paste(dataPath,GDFile, "1.",GDFileExt,sep=""), head = TRUE) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Genotype data loaded)") Memory=GAPIT.Memory(Memory=Memory,Infor="Genotype data loaded") } if(!is.null(GD) ) { GT=as.matrix(GD[,1]) #get taxa GD=GD[,-1] #remove taxa column Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="GT created from GD)") Memory=GAPIT.Memory(Memory=Memory,Infor="GT created from GD") } if(!is.null(G) & !is.null(GD)) stop("Both hapmap and EMMA format exist, choose one only.") if(!is.null(G)) { #Convert HapMap to numerical print(paste("Converting genotype...",sep="")) hm=GAPIT.HapMap(G,model=model) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="HapMap") Memory=GAPIT.Memory(Memory=Memory,Infor="HapMap") print(paste("Converting genotype done.",sep="")) rm(G) gc() Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="G removed") Memory=GAPIT.Memory(Memory=Memory,Infor="G removed") GT=hm$GT GD=hm$GD GI=hm$GI rm(hm) gc() Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="hm removed") Memory=GAPIT.Memory(Memory=Memory,Infor="hm removed") } #Handler of PCA and Kinship input on multiple geneotype files if(is.null(KI) | numPCs>0) GD.Sample=GAPIT.Sampler(GD=GD,dataPath=dataPath,numFiles=numFiles,GFile=GFile,GFileExt=GFileExt,seed=seed,ratio=ratio, model=model, genoFormat=genoFormat, GDFile=GDFile, GDFileExt=GDFileExt) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Sampling genotype") Memory=GAPIT.Memory(Memory=Memory,Infor="Sampling genotype") #Create kinship from genotype if not provide if(is.null(KI)&!is.null(GD)) { print(dim(GD)) if(kinMethod=="EMMA")theKin= emma.kinship(snps=t(as.matrix(.5*GD.Sample)), method="additive", use="all") if(kinMethod=="Loiselle")theKin= GAPIT.kinship.loiselle(snps=t(as.matrix(.5*GD.Sample)), method="additive", use="all") if(kinMethod=="VanRaden")theKin= GAPIT.kinship.VanRaden(snps=GD.Sample) KI=cbind(GT,as.data.frame(theKin)) #This require big memory. Need a way to solve it. rm(theKin) gc() Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="Estimating kinship") Memory=GAPIT.Memory(Memory=Memory,Infor="Estimating kinship") } PC=NULL if(numPCs>0){ PC=GAPIT.PCA(X = GD, taxa = GT, PC.number = numPCs) Timmer=GAPIT.Timmer(Timmer=Timmer,Infor="PCA") Memory=GAPIT.Memory(Memory=Memory,Infor="PCA") } return (list(GD=GD,GI=GI,GT=GT,hasGenotype=hasGenotype,KI=KI,PC=PC,Timmer=Timmer,Memory=Memory)) } ############################################################################################## GAPIT.kinship.VanRaden<- function(snps,hasInbred=TRUE) { # Object: To calculate the kinship matrix using the method of VanRaden (2009, J. Dairy Sci. 91:4414?ˇěC4423) # Authors: Zhwiu Zhang # Last update: August 15, 2011 ############################################################################################## #snps=hm$GD nSNP=ncol(snps) nInd=nrow(snps) n=nInd snpMean= apply(snps,2,mean) snps=snps-snpMean K=tcrossprod((snps), (snps)) #Extract diagonals i=1:n j=(i-1)*n index=i+j d=K[index] DL=min(d) DU=max(d) floor=min(K) K=(K-floor)/(DL-floor) MD=(DU-floor)/(DL-floor) #Handler of diagonals over 2 if(MD>2)K[index]=K[index]/(MD-1)+1 #Handler of inbred if(MD<2 & hasInbred) K=2*K/((DU-floor)/(DL-floor)) return(K) }