## crop545 - lab 4
## Imputation

## Y.Song 1/31/2018 

# 000000000000000000000000000000000000000000000
#
#            ----  0. Review ----
#
# 000000000000000000000000000000000000000000000

# Candidate working directories, one per machine; pass the one in use to setwd().
silver <- "/Users/song/Dropbox/COURSE/CROP_545/2018/lab4"
black <- 'C:/Users/yuanh/Documents/R'
usb <- 'D:/crop545-4'
setwd(usb)


# --- 0.1. importing data -----------------
# The first line of the csv file is read as the column names.
indexEX <- read.csv("./indexEX.csv", header = TRUE)
head(indexEX)

# --- 0.2. Index/subsampling --------------
indexEX[3, ]  # row 3, every column
indexEX[, 3]  # column 3, every row
indexEX$U     # column U, selected by name

# Row filters: which() turns a logical test into the matching row numbers.
indexEX[which(indexEX$set == "C"), ]  # "==" tests for equality
indexEX[which(indexEX$order > 40 & indexEX$L == "-Inf"), ]  # "&" requires both tests

(indx <- c(2, 3, 4))   # the outer () prints the value that was just assigned
indexEX[indx, ]        # rows 2 to 4
head(indexEX[, indx])  # columns 2 to 4; head() prints only the first 6 rows

# --- 0.3. For loop --------------

# A deliberately simple example, just to show how a for-loop works:
# starting at row 5, fill `read` one row at a time. Rows 1-4 keep their
# initial NA. (Assumes indexEX has at least 5 rows; otherwise 5:nrow()
# would count downwards.)
indexEX$read <- NA
for (i in 5:nrow(indexEX)) {
  # Index U with [i] so exactly one value is stored per iteration.
  # Assigning the whole indexEX$U vector to a single cell triggers a
  # "number of items to replace" warning and silently keeps only U[1].
  indexEX$read[i] <- indexEX$U[i] + 100 * i
}
head(indexEX, 10)
# note:
# 1. the index variable can be given any name (j, k, l, ..., z); it
#    doesn't even have to be a single letter
# 2. in most cases, i runs over 1:N, but it can run over any vector
#    e.g. for(i in c(1,3,5,7))
# 3. you can have multiple indices (i, j and k at the same
#    time). We will discuss that when we get to such scenarios.


# 000000000000000000000000000000000000000000000
#
#    ----  1. stochastic imputation ----
#
# 000000000000000000000000000000000000000000000

# --- 1.1 Import data & data cleaning ---------
# Download the numeric genotype table; its first line holds the column names.
myGD <- read.table(
  file = "http://zzlab.net/GAPIT/data/mdp_numeric.txt",
  header = TRUE
)
myGD[1:10, 1:10]  # peek at the top-left corner

X.raw <- myGD[, -1]  # drop the first column (the individual's name)
X <- X.raw           # working copy; X.raw is kept untouched as the reference
X[1:5, 1:5]

set.seed(545)  # fix the random-number stream so the results can be reproduced

# The data have no NA values yet. One way to add some missing values
# (kept for reference, not run):
# ranRow = floor(runif(30, 1, 281))
# ranCol = floor(runif(30, 1, 3093))
# X[ranRow, ranCol] = NA


# --- 1.2 Implement the algorithm --------------------
# Stochastic imputation of missing genotypes.
#
# For every column, the frequency of allele "2" is estimated from the
# observed cells as (sum of genotypes) / (2 * number of observed cells).
# Each missing cell is then filled with 2 with that probability and with 0
# otherwise.
#
# Args:
#   X: matrix or data frame of numeric genotypes, one column per marker,
#      with missing cells stored as NA. The frequency formula assumes the
#      genotypes are coded 0/1/2.
# Returns:
#   X with its NA cells replaced by 0 or 2. A column without any observed
#   genotype has no estimable frequency and is returned unchanged (all NA).
StochasticImpute <- function(X) {
  n <- nrow(X)
  m <- ncol(X)

  # fn - sum of genotypes per column, ignoring missing cells
  fn <- colSums(X, na.rm = TRUE)

  # fc - number of non-missing individuals per column
  fc <- colSums(!is.na(X))

  # Frequency of allele "2"; NaN for a column with nothing observed
  fa <- fn / (2 * fc)

  # seq_len() keeps the loop from running when X has zero columns
  for (i in seq_len(m)) {
    # One uniform draw per individual, taken for every column in order so
    # that a given seed always produces the same imputation.
    index.a <- runif(n) < fa[i]

    # With no observed genotype fa[i] is NaN and index.a is all NA; NA
    # subscripts are rejected by data-frame assignment, so skip the column.
    if (fc[i] == 0) next

    index.na <- is.na(X[, i])
    index.m2 <- index.a & index.na   # missing cells that receive 2
    index.m0 <- !index.a & index.na  # missing cells that receive 0
    X[index.m2, i] <- 2
    X[index.m0, i] <- 0
  }
  X
}

# 000000000000000000000000000000000000000000000
#
# ----  2. Implement & evaluate accuracy ----
#
# 000000000000000000000000000000000000000000000


# --- 2.1 Set missing values -----
# Goal: randomly turn about 20% of the cells into NA.
# One uniform random number is drawn per cell (n * m draws in total).
(n <- nrow(X))
(m <- ncol(X))
(dp <- m * n)
uv <- runif(dp)
hist(uv)
range(uv)  # continuous numeric values between 0 and 1

mr <- .2            # missing rate: the share of cells to blank out
missing <- uv < mr  # TRUE marks a cell that will become NA
missing[1:10]
length(missing)

# Reshape the indicator into an n x m matrix, the same layout as X
index.m <- matrix(missing, n, m)
dim(index.m)

# Blank out the flagged cells, then compare with the untouched original
X[index.m] <- NA
X.raw[1:5, 1:5]
X[1:5, 1:5]

# --- 2.2 Imputation -----
XI <- StochasticImpute(X)

# --- 2.3 Accuracy -------

# accuracy estimate 1: correlation coefficient between the original and
# the imputed values, taken over the masked cells only
accuracy.r <- cor(X.raw[index.m], XI[index.m])

# accuracy estimate 2: proportion of match
# cell-by-cell test: TRUE if the values are the same, FALSE if not
index.match <- X.raw == XI

# index.mm = TRUE: the cell was set to missing
# AND its imputed value is the same as the original
index.mm <- index.match & index.m

# Count the TRUE cells directly. Subsetting X with the mask and taking
# length() would also count cells whose comparison is NA (a value that is
# still missing after imputation) as matches, and it converts the whole
# data frame to a matrix just to count.
accuracy.m <- sum(index.mm, na.rm = TRUE) / sum(index.m)

accuracy.r
accuracy.m



# 000000000000000000000000000000000000000000000
#
# ----  3. The 'Impute' Package ----
#
# 000000000000000000000000000000000000000000000


# --- 3.1 loading the package ----
# 'impute' is distributed through Bioconductor, not CRAN, so a plain
# install.packages("impute") cannot find it. Install it through BiocManager,
# and only when it is not already available, so re-running the script does
# not reinstall the package every time.
if (!requireNamespace("impute", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("impute")
}
library(impute)

# --- 3.2 imputation by k-nearest neighbor ----

# ?impute.knn()

# X is transposed before the call, so the columns of X become the rows of
# the matrix handed to impute.knn(); k = 10 neighbours are used.
X.knn <- impute.knn(as.matrix(t(X)), k = 10)

# --- 3.3 evaluating accuracy----
# Correlation between original and imputed values over the masked cells.
# The kNN result is transposed back so it lines up with index.m.
accuracy.r.si <- cor(X.raw[index.m], XI[index.m])
accuracy.r.knn <- cor(X.raw[index.m], t(X.knn$data)[index.m])
accuracy.r.si
accuracy.r.knn


## 0000000000000000000000000000000000000000000
##
##                     Beagle
## 
## 0000000000000000000000000000000000000000000



# Convert to BEAGLE input format
# Locate each genotype class first, while X is still numeric.
index0 <- X == 0
index1 <- X == 1
index2 <- X == 2
indexna <- is.na(X)

# Recode every cell as a tab-separated allele pair:
# 0 -> A A, 1 -> A B, 2 -> B B, missing -> ? ?
X2 <- X
X2[index0] <- "A\tA"
X2[index1] <- "A\tB"
X2[index2] <- "B\tB"
X2[indexna] <- "?\t?"

# Each row starts with the literal "M" followed by the first column of myGD.
myGD2 <- cbind("M", myGD[, 1], X2)

# Tab-separated text, without quotes, header line or row names
write.table(
  myGD2,
  file = "./test.bgl",
  quote = FALSE,
  sep = "\t",
  col.names = FALSE,
  row.names = FALSE
)

# Impute with BEAGLE
# The arguments are passed as a vector so the command is a single line.
# The previous single string contained a line break in the middle of the
# command and listed the jar twice (once after -jar, once more as a stray
# program argument). beagle.jar is looked up in the working directory.
# NOTE(review): confirm that no extra positional argument is expected by
# the BEAGLE version in use.
beagle.args <- c("-jar", "beagle.jar",
                 "unphased=test.bgl", "missing=?", "out=test1")
# shQuote() keeps the "?" from being treated as a shell wildcard
beagle.status <- system2("java", shQuote(beagle.args))
if (!identical(beagle.status, 0L)) {
  warning("BEAGLE exited with status ", beagle.status, call. = FALSE)
}


# Convert output format
# Read the phased BEAGLE output: space-separated, first line used as header.
genotype.full <- read.delim(
  "test1.test.bgl.phased.gz",
  sep = " ",
  header = TRUE
)

# Drop the first two columns and keep the remaining cells as a matrix
genotype.c <- as.matrix(genotype.full[, -(1:2)])
nr <- nrow(genotype.c)
nc <- ncol(genotype.c)

# Numeric copy of the allele calls: "A" -> 0, "B" -> 1
index.A <- genotype.c == "A"
index.B <- genotype.c == "B"
genotype.n <- matrix(0, nr, nc)
genotype.n[index.A] <- 0
genotype.n[index.B] <- 1

# Add each odd-numbered column to the even-numbered column that follows it,
# giving one 0/1/2 value per column pair
n2 <- ncol(genotype.n)
odd <- seq(1, n2 - 1, 2)
even <- seq(2, n2, 2)
g0 <- genotype.n[, odd]
g1 <- genotype.n[, even]
X.bgl <- g0 + g1


# Calculate the accuracy of the BEAGLE imputation
# Correlation between original and imputed values over the masked cells
accuracy.r <- cor(X.raw[index.m], X.bgl[index.m])

# Proportion of masked cells whose imputed value equals the original.
# TRUE cells are counted directly: subsetting X with the mask and taking
# length() would also count NA comparison results as matches.
index.match <- X.raw == X.bgl
index.mm <- index.match & index.m
accuracy.m <- sum(index.mm, na.rm = TRUE) / sum(index.m)
accuracy.r
accuracy.m