# Crop 545 lab 9 - lec 12 structure & PCA
# code provided by Zhiwu Zhang
# modified and commented by Yuanhong Song

# 000000000000000000000000000000000000000000000000000000000000000
#
#                 0. Setup & prepare the analysis ------
#
# 000000000000000000000000000000000000000000000000000000000000000

## 0.1 Importing materials ----

# rm(list=ls())
# setwd()

source("http://zzlab.net/StaGen/2018/R/G2P.R")
source("http://zzlab.net/StaGen/2018/R/GWASbyCor.R")

myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)

X=myGD[,-1]


## 0.2 G2P simulation and GWAS ----

# You should be familiar with the following steps, and understand what 
# it does. Detailed comment is in  lab 7 - phenotype simulation

index1to5=myGM[,2]<6
X1to5 = X[,index1to5]
set.seed(99164)
mySim=G2P(X= X1to5,h2=.75,alpha=1,NQTN=10,distribution="norm")
p= GWASbyCor(X=X,y=mySim$y) #ignore error in this command

color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
m=nrow(myGM)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
# remember we only simulate phenotype from QTN on chromosome 1-5. Any positve identified
# on 6-10 should be false positive. Please notice that in the plot


## 1. Type I error ----

## 1.1 Observed and null p values ----

# Our null hypothesis - a QTN is not correlated with the phenotype.
index.null=!index1to5 & !is.na(p)
# index.null = on chromosome 1-6 and the p value is not NA
p.null=p[index.null]
# get p values for the index.null condition
m.null=length(p.null)

index.sort=order(p.null) 
# record the rank of p.null
p.null.sort=p.null[index.sort]
# note: the above two command is equivelent to the following
# p.null.sort = sort(p.null)

head(p.null.sort) # smallest p-values
tail(p.null.sort) # lagest p-values

# making a table of the sorted p-values in accending order:
seq=seq(1:m.null)
table=cbind(seq, p.null.sort, seq/m.null)
head(table,15)
tail(table)
# first column is an order index, second is the p-values, the third is quantile

# null distribution of p values
hist(p[!index1to5])
# recall: p is the p value for correlation test we got from GWAS

# The following create a qq plot for observed p-val against p-val under H-null:
# assume p-val under H-null follows a uniform distribution.
p.obs=p[!index1to5] # observed p values

# generate null p values
m2=length(p.obs) # we need the two p value arrays to match up with length, thus
                 # we use the length() function
p.uni=runif(m2,0,1) # get null p value from uniform distribution

order.obs=order(p.obs) # again, for the order() part and indexing, 
order.uni=order(p.uni) # you may also use a sort() function

# 1.2 Q-Q plots for p values ----
# Q-Q plot 1: for p-values
plot( (p.uni[order.uni]), (p.obs[order.obs]), 
      xlab = "p-null", ylab = "p-obs")
abline(a = 0, b = 1, col = "red")
# what do you see from this QQ plot?

# Q-Q plot 2: for -log(p)
plot(-log10(p.uni[order.uni]),-log10(p.obs[order.obs]),
     xlim=c(0,6), ylim=c(0,6), 
     xlab = "-log(p-null)", ylab = "-log(p-obs)")
abline(a = 0, b = 1, col = "red")
# The first and second Q-Q plot will give the same information
# By taking the log, we exaggerated the deviation pattern

# Note: abline is a useful plotting tool, you will use it more often than you thought
# we have been use it to label the positive QTN positions.
# ?abline()

plot(ecdf(-log10(p.obs)))

# 1.3 Type I error cutoff ----

# We will fix Type I error value:
type1=c(0.01, 0.05, 0.1, 0.2)        # set a sequence of type-I cutoff thresholds
(cutoff=quantile(p.obs,type1,na.rm=T)) # p-values at such thresholds 
(type1.p = data.frame(cbind(type1, cutoff)))
plot(type1.p$type1, type1.p$cutoff,type="b",
     xlab="Type I error", ylab="Cutoff p values")


# 2. PCA ----
# There's a literature that answers the following questions:
#  * Why perform PCA with the genomics data?
#  * How does PCA work (with genomics data)?
# Population Structure and Eigenanalysis, by Patterson et al. (PLoS Genetics 2006, 2(12))

# note: prcomp() requires the 'stat' library

# 2.1 Perform PCA analysis with prcomp() ----
# try pca with a small proportion of the dataset
pca=prcomp(X[,1:10])
str(pca)
# Eigen value: $sdev squaed
# Eigen vector: $rotation
# Principal component: $x

# run pca again with the whole dataset
PCA=prcomp(X)
str(PCA)

# 2.2 The prcomp() output ----
PCA$x[1:10,1:5] # the first 5 principle components

pcavar=PCA$sdev^2   # eigenvalues
proportion=pcavar/sum(pcavar) 
# the bigger the eigenvalues is, more variance the correspinding principal component
# can explain. 
# Thus we analysis the relative values of each eigenvalue, which one(s) contributes more?


par(mfrow=c(1,3),mar = c(3,4,1,1))
barplot(PCA$sdev[1:10])  # the square root of eigenvalues
barplot(pcavar[1:10])    # the eigenvalues
plot(proportion[1:10],type="b") # the relative 'importance' of the top eigenvalues

par(mfrow=c(1,1))
plot(PCA$x[,1],PCA$x[,2],col="red") # PC1 against PC2

# 2.3 PC vs. phenotype ----
par(mfrow=c(2,1),mar = c(3,4,1,1))
plot(mySim$y,PCA$x[,1]) # phenometric effect vs. PC1
cor(mySim$y,PCA$x[,1])
plot(mySim$y,PCA$x[,2])
cor(mySim$y,PCA$x[,2]) # phenometric effect vs. PC2

ther=cor(mySim$y,PCA$x[,])


# 2.4 PC vs. QTN's ----

# Run two PCA analysis with:
# 1. SNP's on chromosome 1-5
pca1to5=prcomp(X[,index1to5])
# 2. SNP's on chromosome 6-10
pca6to10=prcomp(X[,!index1to5])

par(mfrow=c(2,2), mar = c(3,4,1,1))
plot(mySim$y, pca1to5$x[,1]) # pheno vs PC1 in CHM1-5, upper-left
cor(mySim$y, pca1to5$x[,1])
plot(mySim$y, pca6to10$x[,1])# pheno vs PC1 in CHM2-6, upper-right
cor(mySim$y, pca6to10$x[,1])

plot(mySim$y, pca1to5$x[,2]) # pheno vs PC2 in CHM1-5, lower-left
cor(mySim$y, pca1to5$x[,2]) 
plot(mySim$y, pca6to10$x[,2])# pheno vs PC2 in CHM1-5, lower-right
cor(mySim$y, pca6to10$x[,2])

plot(-log10(p.uni[order.uni]),-log10(p.obs[order.obs]))
abline(a = 0, b = 1, col = "red")


# Another annotation for R prcomp() output
# * https://www.analyticsvidhya.com/blog/2016/03/practical-guide-principal-component-analysis-python/

# Other ways of perform PCA analysis with R
# * http://web.missouri.edu/~huangf/data/mvnotes/Documents/pca_in_r_2.html