#########################################################################
######### HW3 ######################

# set working directory
setwd("C:/Users/stephanie.sjoberg/Dropbox/2018 Spring Classes/Stat Gen/HW3")
set.seed(99164)
#####	Question 1   ######

# load genotype data
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)

# load genetic map
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)


#Sampling QTN
NQTN=10
X=myGD[,-1]
n=nrow(X); m=ncol(X)
QTN.position=sample(m,NQTN,replace=F)
SNPQ=as.matrix(X[,QTN.position])
QTN.position
head(SNPQ)

# plot QTN
plot(myGM[,c(2,3)])
lines(myGM[QTN.position,c(2,3)],type="p",col="red")
points(myGM[QTN.position,c(2,3)],type="p",col="blue",cex = 5)
dev.copy(png,'QTNpositions.png')
dev.off()



# randomly simulate some additive effect.
# recall the add effect follows a normal distribution
addeffect=rnorm(NQTN,0,1)
effect=SNPQ%*%addeffect
# %*% stands for matrix multiplication
head(effect)

# descriptive plot of the effect
par(mfrow=c(2,2))
plot(effect, main="Scatter plot of effect")
hist(effect)
boxplot(effect, main="Boxplot of effect")
plot(ecdf(effect), main="Cumulative density of effect")
dev.copy(png,'DistGenEffect.png')
dev.off()


# assume an fixed h-sqaure score, and figure residual variance var(e)
h2=.75
effectvar=var(effect)
effectvar
residualvar=(effectvar-h2*effectvar)/h2 
# see the lecture for this equation
residualvar
residual=rnorm(n,0,sqrt(residualvar))
head(residual)

par(mfrow=c(2,2))
plot(residual,main="Scatter plot of residual")
hist(residual)
boxplot(residual, main="Boxplot of residual")
plot(ecdf(residual), main="Cumulative density of residual")
dev.copy(png,'DistResidual.png')
dev.off()

y=effect+residual
par(mfrow=c(2,2))
plot(y,main="Scatter plot of simulated phenotype")
hist(y,main="Histogram of simulated phenotype")
boxplot(y,main="Boxplot of simulated phenotype")
plot(ecdf(y),main="Cumulative density of simulated phenotype")
dev.copy(png,'Phenotype.png')
dev.off()


# How much of variation in the phenotypic trait was due to genetic 
# variation
va=var(effect)
ve=var(residual)
vp=var(y)
v=matrix(c(va,ve,vp),1,3)
colnames(v)=c("A", "E","P") # assign names to columns

par(mfrow=c(1,1))
plot(density(y),ylim=c(0,.25), main="Density of Y, G, and e")
lines(density(effect),col="blue")
lines(density(residual),col="red")
polygon(density(effect), col="forestgreen")
polygon(density(residual), col="grey")
polygon(density(y), col="deepskyblue")
legend("topright", legend = c("Phenotype","Genetic effect","Residual"),fill=c("deepskyblue","forestgreen","grey"),bty="n" )



dev.copy(png,'Variances1.png')
dev.off()
barplot(v,col="gray")
dev.copy(png,'Variances2.png')
dev.off()


# Correlations between y and effects
# Plot
par(mfrow=c(1,3))
plot(y,effect)
plot(y,residual)
plot(residual,effect)
cor(y,effect)
dev.copy(png,'CorXY.png')
dev.off()



##  Question 2  ##


### 10.2.1 first run, general idea ----
# use G2P function, randomly select 10 QTN and simulate phenotype

G2P=function(X,h2,alpha,NQTN,distribution){
  n=nrow(X)
  m=ncol(X)
  #Sampling QTN
  QTN.position=sample(m,NQTN,replace=F)
  SNPQ=as.matrix(X[,QTN.position])
  QTN.position
  #QTN effects
  if(distribution=="norm")
  {addeffect=rnorm(NQTN,0,1)
  }else
  {addeffect=alpha^(1:NQTN)}
  #Simulate phenotype
  effect=SNPQ%*%addeffect
  effectvar=var(effect)
  residualvar=(effectvar-h2*effectvar)/h2
  residual=rnorm(n,0,sqrt(residualvar))
  y=effect+residual
  return(list(addeffect = addeffect, 
              y=y, add = effect, 
              residual = residual, 
              QTN.position=QTN.position, 
              SNPQ=SNPQ))
}

X=myGD[,-1]
mySim=G2P(X,h2=.75, alpha=1, NQTN=10, distribution="norm")
# with 75% heritability, and assume normally distributed residual, we ramdomly selected 10 QTN's
# and simulated their corresponding phenotype.
str(mySim)

### r, t, p 
# the idea here is that the statistic t=t(r) follows a t-distribution.
# use such distribution information, we can calculate the probability

# calculate correlation coefficient between y and each col of X 
r=cor(mySim$y,X)
n=nrow(X)
t=r/sqrt((1-r^2)/(n-2)) # see lecture slide 10.5 for this equation
# t is a t-distributed r.v. 
# use the t-distribution calculate the 
p=2*(1-pt(abs(t),n-2))

### Manhattan plot
color.vector <- rep(c("deepskyblue","orange","forestgreen","indianred3"),10)
# defining a color array
# note1: all colors have specific names or ID# in R
# note2: also there'are packages deal with colors in R. Please google if interested

m=ncol(X)
par(mfrow=c(1,1))
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
# plot the -log(p) 
# myGM[, 2] is the chromosome column
# color.vector[myGM[,2]] markers on same chromosome get same color
abline(v=mySim$QTN.position, lty = 2, lwd=1, col = "black")
# add an vertical line to mark the QTN position
dev.copy(png,'ManhattanPlot.png')
dev.off()



##  Question 3  ##

# top 10 SNPs
index=order(p)
top10=index[1:10]
detected=intersect(top10,mySim$QTN.position)
falsePositive=setdiff(top10, mySim$QTN.position)
top10
mySim$QTN.position
detected
length(detected)
falsePositive
plot(t(-log10(p))~seq(1:m),col=color.vector[myGM[,2]])
abline(v=mySim$QTN.position, lty = 2, lwd=2, col = "black")
abline(v= falsePositive, lty = 2, lwd=2, col = "red")
dev.copy(png,'Top10SNP.png')
dev.off()


##  Question 4  ##

#sort the p values with increasing order
output_1to7= sort(p[mySim$QTN.position],decreasing = F)[1:7] 
#the seventh significant p value
seventh_p=output_1to7[7]
seventh_p # 0.06087149
# count the number of QTN whose p value is less than the seventh_p
length(p[p<seventh_p]) #  517

##  Question 5  ##

source("GWASbyCor.R")
nrep=100
myrep_Q5=replicate (nrep,{
  mySim=G2P(X,h2=.75,alpha=1,NQTN=10,distribution="normal") #phenotype simulation
  p= as.vector(GWASbyCor(X=X,y=mySim$y))#GWAS by using the correlation methods
  
  index=order(p) #order the P value from the smallest to the biggest
  top10=index[1:10]
  detected=intersect(top10,mySim$QTN.position)#find the position of top10 detected QTN
  
  num_detected= length(detected) # the number of QTNs among top ten associated SNPs
  
  output_1to7= sort(p[mySim$QTN.position],decreasing = F)[1:7]
  seventh_p= output_1to7[7]
  num_smaller_p= length(p[p<seventh_p])
  
  num_output=c()
  num_output=c(num_detected,num_smaller_p)#combine the output into matrix
  
})
print(rowMeans(myrep_Q5))
# 5.16 (# QTN in top 10 p-value SNPs)  211.59 (# QTN with p-value lower than 7th highest)
print(apply(myrep_Q5,1,sd))
# 1.253037 103.858304





##  Question 6 Extra Credit ##

rm(list=ls())
set.seed(99164)

G2P=function(X,h2,alpha,NQTN,distribution){
  n=nrow(X)
  m=ncol(X)
  #Sampling QTN
  QTN.position=sample(m,NQTN,replace=F)
  SNPQ=as.matrix(X[,QTN.position])
  QTN.position
  #QTN effects
  if(distribution=="normal")
  {addeffect=rnorm(NQTN,0,1)
  }else
  {addeffect=alpha^(1:NQTN)}
  
  #Simulate phenotype
  effect=SNPQ%*%addeffect
  effectvar=var(effect)
  residualvar=(effectvar-h2*effectvar)/h2
  residual=rnorm(n,0,sqrt(residualvar))
  y=effect+residual
  return(list(addeffect = addeffect, y=y, add = effect, residual = residual, QTN.position=QTN.position, SNPQ=SNPQ))
  
}

# load genotype data
myGD=read.table(file="http://zzlab.net/GAPIT/data/mdp_numeric.txt",head=T)

# load genetic map
myGM=read.table(file="http://zzlab.net/GAPIT/data/mdp_SNP_information.txt",head=T)

X=myGD[,-1]

##set parameters for G2P function
h2=0.75
NQTN=10
alpha=0.2 # used as parameter to calculate quantile for distribution

mySim=G2P(X,0.75,0.2,10,"geom") ##set the genetic effect of the first QTN as 0.3


#plot the heritability###
#explore the relationship through density plot
plot(density(mySim$y),lwd=1,main="", xlim=c(-0.3,0.7),ylim=c(0,12)) #plot the distribution of phenotypic value of 281 individuals 
lines(density(mySim$add),col="black",lwd=1) #plot the proportion of genetic effect of phenotypic value 
lines(density(mySim$residual),col="black",lwd=1) #plot the rest proportion of phenotypic value that can not be explained called residual
polygon(density(mySim$add), col="forestgreen")
polygon(density(mySim$residual), col="grey")
polygon(density(mySim$y), col="deepskyblue")
legend("topright", legend = c("Phenotype","Genetic effect","Residual"),fill=c("deepskyblue","forestgreen","grey"),bty="n" )
dev.copy(png,'VariancesSkewed.png')
dev.off()




#plot the correlation between y,add and residual
par(mfrow=c(1,3))

plot(mySim$y,mySim$add,xlab="Phenotype",ylab="Genetic effect")
legend(x='topleft', legend=paste('r =',round(cor(mySim$y, mySim$add),2)))#add pearson correlation coefficient

plot(mySim$y,mySim$residual,xlab="Phenotype",ylab="Residual")
legend(x='topleft', legend=paste('r =',round(cor(mySim$y,mySim$residual),2)))#add pearson correlation coefficient

plot(mySim$add,mySim$residual,xlab="Genetic effect",ylab="Residual")
legend(x='topleft', legend=paste('r =',round(cor(mySim$add,mySim$residual),2)))#add pearson correlation coefficient

dev.copy(png,'CorSkewed.png')
dev.off()