# Crop 545 - lab 7
# An annotated script for Lec#8 - Lec#12 PCA
# In this script, section 8.15 meanse the code is provided in lecture 8, slide 15


## 000000000000000000000000000000000000000000
##
## Lec 9 Phynometric simulation & G2P -----
##
## 000000000000000000000000000000000000000000

# 9.1: phenometric distribution ----
# 
m=20 # number of gene 
n=100  # number of individual
x=runif(m) # x = a uniform r.v. with the length of m
gene=matrix(x,n,m,byrow = T) 
# gene = a matrix with n row and m col
# rows are repeats of the x we created above
head(gene) 
# The above create the matrix in 8.6 "assign to individuals"

# next we will assign True/False values to each elemant in 
# gene matrix by 50/50 chance. This is a review of how to 
# flip a coin with R. Please make sure you understand.
galton=matrix(runif(n*m),n,m)
# galton is a matrix full of uniform (0,1) r.v., with same
# dimension as 'gene'
head(galton)
galton.binary=galton<.5
head(galton.binary)
# the above generate the matrix in 8.6 "Uniform random variable"

gene[galton.binary]=0 # at the 'TRUE' location, the origrinal 'gene' value
# will be turned into zero
head(gene)
# till here, certain genes in some individuals are 'expressed', 
# in some are 'muted'. 

y=rowSums(gene) # sum up the total gene effect in each individual
y[1:6]
hist(y, freq=F); lines(density(y))

# wrap up into a function
phe.sim = function(m, n){
  x=runif(m)
  gene=matrix(x,n,m,byrow = T)
  galton=matrix(runif(n*m),n,m)
  gene[galton<.5]=0
  y=rowSums(gene)
  hist(y, freq=F, breaks=round(n/10), col="skyblue",lty="blank",
       main="Simulated Phenometric Distribution")
  lines(density(y), col="red")
}
phe.sim(m=1000, n=10000)

## 9.2 QTN position plot----
# the hapmap example data is a subset of the dataset downloaded from the Dryland website.
hapmapEX=read.table("hapmapEX.txt", header = T)
hapmapEX[1:4, 1:6]
# col3 = chrommosome
# col4 = position 
# It should be a standard in all Hapmap format
plot(hapmapEX[, c(3,4)], 
     col="skyblue", xlab="Chromosome", ylab="Position",
     cex.lab=0.7)

# the following steps randomly select some markers
source("hapmap2df.txt")
n=nrow(hapmapEX);m=ncol(hapmapEX)
NQTN=10  # define a number 
QTN.position=sample(n,NQTN,replace=F)
QTN.position
# randomly generate 10(NQTN) numbers from 1 to the number of m
SNPQ=as.matrix(hapmapEX[QTN.position,])
# extract the selected QTN 
head(SNPQ)
# note: this example uses hapmap file, in which the rows stands for markers and cols for 
# individual. This is different from the example given in lecture.


## 9.3 simulate phenotype ----
#Sampling QTN
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
NQTN=10
X=myGD[,-1]
n=nrow(X); m=ncol(X)
QTN.position=sample(m,NQTN,replace=F)
SNPQ=as.matrix(X[,QTN.position])
QTN.position
head(SNPQ)

# randomly simulate some additive effect.
# recall the add effect follows a normal distribution
addeffect=rnorm(NQTN,0,1)
effect=SNPQ%*%addeffect
# %*% stands for matrix multiplication
head(effect)

# descriptive plot of the effect
par(mfrow=c(2,2))
plot(effect, main="Scatter plot of effect")
hist(effect)
boxplot(effect, main="Boxplot of effect")
plot(ecdf(effect), main="Cumulative density of effect")

# assume an fixed h-sqaure score, and figure residual variance var(e)
h2=.7
effectvar=var(effect)
effectvar
residualvar=(effectvar-h2*effectvar)/h2 
# see the lecture for this equation
residualvar
residual=rnorm(n,0,sqrt(residualvar))
head(residual)

par(mfrow=c(2,2))
plot(residual,main="Scatter plot of residual")
hist(residual)
boxplot(residual, main="Boxplot of residual")
plot(ecdf(residual), main="Cumulative density of residual")

y=effect+residual
par(mfrow=c(2,2))
plot(y,main="Scatter plot of simulated phenotype")
hist(y,main="Histogram of simulated phenotype")
boxplot(y,main="Boxplot of simulated phenotype")
plot(ecdf(y),main="Cumulative density of simulated phenotype")

## 9.4 Heritability ----
# How much of variation in the phenotypic trait was due to genetic 
# variation
va=var(effect)
ve=var(residual)
vp=var(y)
v=matrix(c(va,ve,vp),1,3)
colnames(v)=c("A", "E","P") # assign names to columns

par(mfrow=c(1,2))
plot(density(y),ylim=c(0,.3), main="Density of Y, G, and e")
lines(density(effect),col="blue")
lines(density(residual),col="red")
barplot(v,col="gray")

# Correlations between y and effects
# Plot
par(mfrow=c(1,3))
plot(y,effect)
plot(y,residual)
plot(residual,effect)
cor(y,effect)


## 9.5 G2P Function ----
# wrapping the above into the G2P function
G2P=function(X,h2,alpha,NQTN,distribution){
  n=nrow(X)
  m=ncol(X)
  #Sampling QTN
  QTN.position=sample(m,NQTN,replace=F)
  SNPQ=as.matrix(X[,QTN.position])
  QTN.position
  #QTN effects
  if(distribution=="norm")
  {addeffect=rnorm(NQTN,0,1)
  }else
  {addeffect=alpha^(1:NQTN)}
  #Simulate phenotype
  effect=SNPQ%*%addeffect
  effectvar=var(effect)
  residualvar=(effectvar-h2*effectvar)/h2
  residual=rnorm(n,0,sqrt(residualvar))
  y=effect+residual
  return(list(addeffect = addeffect, 
              y=y, add = effect, 
              residual = residual, 
              QTN.position=QTN.position, 
              SNPQ=SNPQ))
}

## 000000000000000000000000000000000000000000
##
##         ---- Lec 10 GWASbyR -----
##
## 000000000000000000000000000000000000000000


## 10.1 corr. coef. and t distribution ----
cort=function(n=10000,df=100){
  z=replicate(n,{
    x=rnorm(df+2)
    y=rnorm(df+2)
    r=cor(x,y)
    t=r/sqrt((1-r^2)/(df))
  })
  return(z)}

x=cort(10000,5)
t=rt(100000,5)
plot(density(x),col="blue")
lines(density(t),col="red")


## 10.2 GWAS ----
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)
source("G2P.r")
# In this section, we run the GWAS algorithom twice. The idea and the codes are almost the same.
# The first time, we don't do any complicated indexing, focusing on how GWAS works in general.
# Second time, we will use index to mark our QTN position, and run the same process.


### 10.2.1 first run, general idea ----
# use G2P function, randomly select 10 QTN and simulate phenotype
X=myGD[,-1]
mySim=G2P(X,h2=.75, alpha=1, NQTN=10, distribution="norm")
# with 75% heritability, and assume normally distributed residual, we ramdomly selected 10 QTN's
# and simulated their corresponding phenotype.
str(mySim)

plot(myGM[,c(2,3)])
lines(myGM[mySim$QTN.position,c(2,3)],type="p",col="red")
points(myGM[mySim$QTN.position,c(2,3)],type="p",col="blue",cex = 5)
# use a bubble plot to check the locations of the selected 10 QTN
# note how we get the location: mySim$QTN.position

### r, t, p 
# the idea here is that the statistic t=t(r) follows a t-distribution.
# use such distribution information, we can calculate the probability

# calculate correlation coefficient between y and each col of X 
r=cor(mySim$y,X)
n=nrow(X)
t=r/sqrt((1-r^2)/(n-2)) # see lecture slide 10.5 for this equation
# t is a t-distributed r.v. 
# use the t-distribution calculate the 
p=2*(1-pt(abs(t),n-2))

### Manhattan plot
color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
# defining a color array
# note1: all colors have specific names or ID# in R
# note2: also there'are packages deal with colors in R. Please google if interested

m=ncol(X)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
# plot the -log(p) 
# myGM[, 2] is the chromosome column
# color.vector[myGM[,2]] markers on same chromosome get same color
abline(v=mySim$QTN.position, lty = 2, lwd=1, col = "black")
# add an vertical line to mark the QTN position

## 10.2.2 second run: GWAS with known marker location ----
# same thing as 10.2, but when simulate the phenotype, we only 
# use markers on the 1 - 5 chromosome
index1to5=myGM[,2]<6
X1to5 = X[,index1to5]

set.seed(99164)
mySim=G2P(X=X1to5,h2=.75,alpha=1,NQTN=10,distribution="norm")
str(mySim)
# we only sampled markers
plot(myGM[,c(2,3)])
lines(myGM[mySim$QTN.position,c(2,3)],type="p",col="red")
points(myGM[mySim$QTN.position,c(2,3)],type="p",col="blue",cex = 5)

r=cor(mySim$y,X)
n=nrow(X)
t=r/sqrt((1-r^2)/(n-2))
p=2*(1-pt(abs(t),n-2))

color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
m=ncol(X)
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=1.5, col = "black")

sort(p)[1:5]
zeros=p==0
p[zeros]=1e-10
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=1.5, col = "black")

## 10.4 GWASbyCor function ----
# Similar as G2P, we can wrap up things we did above into a function:
GWASbyCor=function(X,y){
  n=nrow(X)
  r=cor(y,X)
  n=nrow(X)
  t=r/sqrt((1-r^2)/(n-2))
  p=2*(1-pt(abs(t),n-2))
  zeros=p==0
  p[zeros]=1e-10
  return(p)
}
# X - numerical genotype data
# y - phenotype (numerical)

## 10.5 False positive ----
index=order(p)
top10=index[1:10] # the marker location of most 'significant' markers
detected=intersect(top10,mySim$QTN.position) 
# an intersection of two marker locations set
# top10 is the most significant 10
# mySim$QTN.position is the selected QTN positions for simulation in G2P function
falsePositive=setdiff(top10, mySim$QTN.position)
# falsePositive are those who are identified as 'significant' but are actually 
# not contributing to the phenotype
top10
mySim$QTN.position
detected
length(detected)
falsePositive

plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(v= falsePositive, lty = 2, lwd=2, col = "red")
# same plot as before: manhattan plot with black line on marker location
# add red line to mark false positive locations

p.obs=p[!index1to5] # the P values for markers on 6 -10
m2=length(p.obs)
p.uni=runif(m2,0,1) # null distribution - markers on 6-10 have no extrodinary contribution to the phenotype
order.obs=order(p.obs)
order.uni=order(p.uni)

plot(-log10(p.uni[order.uni]),-log10(p.obs[order.obs]), xlim = c(0,5), ylim=c(0,5)) 
# QQ plot - if things are distributed in a similar way
# If yes, the curve will roughly lie on a 1-1 line
# in our case here, no
abline(a = 0, b = 1, col = "red")

plot(ecdf(-log10(p.obs)))

type1=c(0.01, 0.05, 0.1, 0.2)
cutoff=quantile(p.obs,type1,na.rm=T)
cutoff
plot(type1, cutoff,type="b")