# Crop 545 lab 9 - lec 12 structure & PCA # code provided by Zhiwu Zhang # modified and commented by Yuanhong Song # 000000000000000000000000000000000000000000000000000000000000000 # # 0. Setup & prepare the analysis ------ # # 000000000000000000000000000000000000000000000000000000000000000 ## 0.1 Importing materials ---- # rm(list=ls()) # setwd() source("http://zzlab.net/StaGen/2018/R/G2P.R") source("http://zzlab.net/StaGen/2018/R/GWASbyCor.R") myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T) myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T) X=myGD[,-1] ## 0.2 G2P simulation and GWAS ---- # You should be familiar with the following steps, and understand what # it does. Detailed comment is in lab 7 - phenotype simulation index1to5=myGM[,2]<6 X1to5 = X[,index1to5] set.seed(99164) mySim=G2P(X= X1to5,h2=.75,alpha=1,NQTN=10,distribution="norm") p= GWASbyCor(X=X,y=mySim$y) #ignore error in this command color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10) m=nrow(myGM) plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]]) abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black") # remember we only simulate phenotype from QTN on chromosome 1-5. Any positve identified # on 6-10 should be false positive. Please notice that in the plot ## 1. Type I error ---- ## 1.1 Observed and null p values ---- # Our null hypothesis - a QTN is not correlated with the phenotype. index.null=!index1to5 & !is.na(p) # index.null = on chromosome 1-6 and the p value is not NA p.null=p[index.null] # get p values for the index.null condition m.null=length(p.null) index.sort=order(p.null) # record the rank of p.null p.null.sort=p.null[index.sort] # note: the above two command is equivelent to the following # p.null.sort = sort(p.null) head(p.null.sort) # smallest p-values tail(p.null.sort) # lagest p-values # making a table of the sorted p-values in accending order: seq=seq(1:m.null) table=cbind(seq, p.null.sort, seq/m.null) head(table,15) tail(table) # first column is an order index, second is the p-values, the third is quantile # null distribution of p values hist(p[!index1to5]) # recall: p is the p value for correlation test we got from GWAS # The following create a qq plot for observed p-val against p-val under H-null: # assume p-val under H-null follows a uniform distribution. p.obs=p[!index1to5] # observed p values # generate null p values m2=length(p.obs) # we need the two p value arrays to match up with length, thus # we use the length() function p.uni=runif(m2,0,1) # get null p value from uniform distribution order.obs=order(p.obs) # again, for the order() part and indexing, order.uni=order(p.uni) # you may also use a sort() function # 1.2 Q-Q plots for p values ---- # Q-Q plot 1: for p-values plot( (p.uni[order.uni]), (p.obs[order.obs]), xlab = "p-null", ylab = "p-obs") abline(a = 0, b = 1, col = "red") # what do you see from this QQ plot? # Q-Q plot 2: for -log(p) plot(-log10(p.uni[order.uni]),-log10(p.obs[order.obs]), xlim=c(0,6), ylim=c(0,6), xlab = "-log(p-null)", ylab = "-log(p-obs)") abline(a = 0, b = 1, col = "red") # The first and second Q-Q plot will give the same information # By taking the log, we exaggerated the deviation pattern # Note: abline is a useful plotting tool, you will use it more often than you thought # we have been use it to label the positive QTN positions. # ?abline() plot(ecdf(-log10(p.obs))) # 1.3 Type I error cutoff ---- # We will fix Type I error value: type1=c(0.01, 0.05, 0.1, 0.2) # set a sequence of type-I cutoff thresholds (cutoff=quantile(p.obs,type1,na.rm=T)) # p-values at such thresholds (type1.p = data.frame(cbind(type1, cutoff))) plot(type1.p$type1, type1.p$cutoff,type="b", xlab="Type I error", ylab="Cutoff p values") # 2. PCA ---- # There's a literature that answers the following questions: # * Why perform PCA with the genomics data? # * How does PCA work (with genomics data)? # Population Structure and Eigenanalysis, by Patterson et al. (PLoS Genetics 2006, 2(12)) # note: prcomp() requires the 'stat' library # 2.1 Perform PCA analysis with prcomp() ---- # try pca with a small proportion of the dataset pca=prcomp(X[,1:10]) str(pca) # Eigen value: $sdev squaed # Eigen vector: $rotation # Principal component: $x # run pca again with the whole dataset PCA=prcomp(X) str(PCA) # 2.2 The prcomp() output ---- PCA$x[1:10,1:5] # the first 5 principle components pcavar=PCA$sdev^2 # eigenvalues proportion=pcavar/sum(pcavar) # the bigger the eigenvalues is, more variance the correspinding principal component # can explain. # Thus we analysis the relative values of each eigenvalue, which one(s) contributes more? par(mfrow=c(1,3),mar = c(3,4,1,1)) barplot(PCA$sdev[1:10]) # the square root of eigenvalues barplot(pcavar[1:10]) # the eigenvalues plot(proportion[1:10],type="b") # the relative 'importance' of the top eigenvalues par(mfrow=c(1,1)) plot(PCA$x[,1],PCA$x[,2],col="red") # PC1 against PC2 # 2.3 PC vs. phenotype ---- par(mfrow=c(2,1),mar = c(3,4,1,1)) plot(mySim$y,PCA$x[,1]) # phenometric effect vs. PC1 cor(mySim$y,PCA$x[,1]) plot(mySim$y,PCA$x[,2]) cor(mySim$y,PCA$x[,2]) # phenometric effect vs. PC2 ther=cor(mySim$y,PCA$x[,]) # 2.4 PC vs. QTN's ---- # Run two PCA analysis with: # 1. SNP's on chromosome 1-5 pca1to5=prcomp(X[,index1to5]) # 2. SNP's on chromosome 6-10 pca6to10=prcomp(X[,!index1to5]) par(mfrow=c(2,2), mar = c(3,4,1,1)) plot(mySim$y, pca1to5$x[,1]) # pheno vs PC1 in CHM1-5, upper-left cor(mySim$y, pca1to5$x[,1]) plot(mySim$y, pca6to10$x[,1])# pheno vs PC1 in CHM2-6, upper-right cor(mySim$y, pca6to10$x[,1]) plot(mySim$y, pca1to5$x[,2]) # pheno vs PC2 in CHM1-5, lower-left cor(mySim$y, pca1to5$x[,2]) plot(mySim$y, pca6to10$x[,2])# pheno vs PC2 in CHM1-5, lower-right cor(mySim$y, pca6to10$x[,2]) plot(-log10(p.uni[order.uni]),-log10(p.obs[order.obs])) abline(a = 0, b = 1, col = "red") # Another annotation for R prcomp() output # * https://www.analyticsvidhya.com/blog/2016/03/practical-guide-principal-component-analysis-python/ # Other ways of perform PCA analysis with R # * http://web.missouri.edu/~huangf/data/mvnotes/Documents/pca_in_r_2.html