# selection update
###
# this is currently doing worse and noisier than not correcting- e.g. correction 
# by PCA introduces noise. was afraid of.
# NOT USING THIS.
###

countnas = function(matrix) {
  nanum = c()
  for (i in rownames(matrix)) {
    nanum[i] = length(which(is.na(matrix[i,])))
  }
  return(nanum)
}


gc_content = function(str) {
  gc = str_count(str,'G') + str_count(str,'C')
  n=nchar(str)
  return(gc/n)
}

# first run main str_analysis_081216.R script.
strs = read.csv('~/Dropbox/Ath_STRs/str_var/str_masterdata.csv',header=T,stringsAsFactors=FALSE)
strs = cbind(strs,gc_content(strs$Consensus))
colnames(strs)[colnames(strs)=="gc_content(strs$Consensus)"] = 'GC_content'
rownames(strs) = strs$ID

# add MIP arm info
mips = read.csv('~/Dropbox/Ath_STRs/2100_mips_designed.csv',header=T,stringsAsFactors = FALSE)
# have to reparse these...
mip_ids = gsub('([0-9]+)_.+', '\\1', mips$mip_name)
rownames(mips) = mip_ids
ta_gc = rep(NA,nrow(strs))
la_gc = rep(NA,nrow(strs))
scan_gc = rep(NA,nrow(strs))
names(ta_gc) = rownames(strs)
names(la_gc) = rownames(strs)

# get GC contents
ta_gc[mip_ids] = gc_content(mips$ext_probe_sequence)
la_gc[mip_ids] = gc_content(mips$lig_probe_sequence)
scan_gc[mip_ids] = gc_content(mips$scan_target_sequence)

strs = cbind(strs,ta_gc = ta_gc[rownames(strs)],
             la_gc = la_gc[rownames(strs)],
             scan_gc = scan_gc[rownames(strs)])

# some measures of spread
genos[nacount>70,] = rep(NA,96)

logged = na.omit(log10(apply(genos,1,sd,na.rm=TRUE)+.1))
sd_strs = names(logged)
strs = strs[sd_strs,]

annotation = read.table('~/Dropbox/Ath_STRs/str_annots_picked.txt',header=T,sep='\t',stringsAsFactors = FALSE)
annotation[annotation=='HS'] = 'DHS'
rownames(annotation) = annotation$ID  
annotation = annotation[sd_strs,]
sd_genos = genos[sd_strs,]

# see if correcting pop structure does anything?
## THE LMEKIN SOLUTION DOES NOT WORK
load('~/Dropbox/Ath_STRs/Kinmat.Rdat') # becomes 'K'
colnames(K) = colnames(sd_genos) # not matching case etc.
rownames(K) = colnames(sd_genos) # etc.
lmer_corr = function(genos,id,K) {
  require(lmekin)
  dat = na.omit(data.frame(genos=as.numeric(genos),id=id))
  nullmodel = lmekin(genos ~ (1|id), varlist = K, data=dat, method='ML')
  return(residuals(nullmodel))
}

# TRY PCA POP STRUCTURE CORRECTION
lm_corr_logsd_entrop = function(geno,pca,type='logsd') {
  require(infotheo)
  dat = na.omit(data.frame(as.numeric(geno), pca$x[,1:10] ) )
  #print(dat)
  colnames(dat) = c('geno','pc1','pc2','pc3','pc4','pc5','pc6','pc7','pc8','pc9','pc10')  
  mod = lm(geno ~ ., data=dat)
  print(summary(mod))
  resid = residuals(mod)
  plot(dat$geno,resid)
  print(log10(sd(dat$geno)+.1))
   #print(resid)
  if (type=='logsd') { return( log10(sd(resid)+.1) ) } else if (type=='entropy') { return( entropy(resid)  )  } else {return(NA)}
  #return(list('entro' = entr, 'logstdev' = logstdev))
}

logged = apply(sd_genos,1,lm_corr_logsd_entrop,pca=scale_pca,type='logsd')

require(infotheo)
#entrop = apply(sd_genos,1,lm_corr_logsd_entrop,pca=scale_pca,type='entrop')
#entrop = apply(round(sd_genos[sd_strs,]),1,entropy)
#varcor = cor(strs[,'VARscore'],entrop)
#plot(strs[,'VARscore'],entrop,main=paste('VARscore r=',varcor),cex=.4,xlab='VARscore',ylab='STR entropy')


plot(strs[,'VARscore'],logged,ylab='logSD(str)',cex=.4)

# distributions of responses
par(mfrow=c(2,2))
hist(logged,20,xlab='log(SD of STR variation)',main='')
#hist(entrop,20,xlab='Entropy of STR variation',main='')
plot(logged,strs[,'VARscore'],xlab='log(SD of STR variation)',ylab='VARscore',pch='.',main=cor.test(logged,strs[,'VARscore'],na.rm=TRUE)$estimate)
#plot(entrop,strs[,'VARscore'],xlab='Entropy of STR variation',ylab='VARscore',pch='.',main=cor.test(entrop,strs[,'VARscore'],na.rm=TRUE)$estimate)
par(mfrow=c(1,1))

# split by annotation
nc = sd_genos[annotation$annotation %in% c('intergenic','plus/minus_1kb'),]
cod = sd_genos[annotation$annotation %in% c('exon','exon/DHS'),]
dhs = sd_genos[annotation$annotation %in% c('DHS','exon/DHS','DHS/non_gene','intron/DHS'),]
intr = sd_genos[annotation$annotation %in% c('intron','intron/DHS'),]

#cove = cov(cbind(strs$VARscore,entrop))
maha = mahalanobis(cbind(strs$VARscore,logged),cov=cove,center=FALSE)

#cove = cov(na.omit(cbind(strs[rownames(nc),]$VARscore,entrop[rownames(nc)])))
mahan = mahalanobis(na.omit(cbind(strs[rownames(nc),]$VARscore,logged[rownames(nc)])),cov=cove,center=FALSE)

#cove = cov(na.omit(cbind(strs[rownames(cod),]$VARscore,entrop[rownames(cod)])))
mahac = mahalanobis(na.omit(cbind(strs[rownames(cod),]$VARscore,logged[rownames(cod)])),cov=cove,center=FALSE)

#cove = cov(na.omit(cbind(strs[rownames(dhs),]$VARscore,entrop[rownames(dhs)])))
mahad =  mahalanobis(na.omit(cbind(strs[rownames(dhs),]$VARscore,logged[rownames(dhs)])),cov=cove,center=FALSE)

#cove = cov(na.omit(cbind(strs[rownames(intr),]$VARscore,entrop[rownames(intr)])))
mahai = mahalanobis(na.omit(cbind(strs[rownames(intr),]$VARscore,logged[rownames(intr)])),cov=cove,center=FALSE)

# not very interesting....
plot(density(maha),xlim=c(0,40),ylim=c(0,.10),main='mahalanobis distance distributions')
lines(density(mahac),col='red')
lines(density(mahan),col='blue')
lines(density(mahad),col='green')
lines(density(mahai),col='purple')
legend(30,.09,legend=c('All','coding','noncoding','DHS','Intron'),fill=c('black','red','blue','green','purple'))

# try a SVM?? (like legendre) (SVR (regression)- using epsilon regression,
# which is apparently better in situations where you're not too worried about 
# the complexity of the solution: 
# http://stats.stackexchange.com/questions/94118/difference-between-ep-svr-and-nu-svr-and-least-squares-svr)
# i have n>>p so i am ok i think

# mean squared error to evaluate model performance
MSE = function(predicted,actual) {
  sq_err = (predicted-actual)^2
  return(mean(sq_err))
}

svr_fit = function(data,str_ids,response) {
  fit=ksvm(response[str_ids]~.,as.matrix(data[str_ids,]),cross=5,scaled=FALSE)
  return(fit)
}

svr_pred_eval = function(svr, newdata, str_ids,response) {
  pred = predict(svr,newdata=newdata[str_ids,])
  core = cor(response[str_ids], pred)
  mse = MSE(response[str_ids], pred)
  out = list(pred,core,mse)
  names(pred) = str_ids
  names(out)=c('predict','cor','mse')
  return(out)
}

# restrict to numeric predictors, trying various... added multiple gc contents for mips too
#svm_strs=strs[,c("TAIR10start","Unit.Size","Purity","X..of.Units","GC_content","ta_gc","la_gc","scan_gc")]
svm_strs=strs[,c("Unit.Size","Purity","X..of.Units","GC_content","ta_gc","la_gc","scan_gc")]
# worked ok
#svm_strs=strs[,c("TAIR10start","Chromosome","Unit.Size","Purity","X..of.Units","GC_content")]
# also ok
#svm_strs=strs[,c("Chromosome","Unit.Size","Purity","X..of.Units","GC_content")]
# a bit narrower, overall somewhat better fit? (with SVR)
#svm_strs=strs[,c("Unit.Size","Purity","X..of.Units","GC_content")]


#svm_strs$TAIR10start = svm_strs$TAIR10start / chr_lens[strs$Chromosome]

rownames(svm_strs) = names(logged)
# replace reference copy number with median of observed if present
for (i in names(entrop)) {
  #print(i)
  names(svm_strs$X..of.Units) = names(entrop)
  med = median(as.numeric(genos[i,]),na.rm=TRUE)
  #print(med)
  if (is.na(med)) { continue }
  else {
    svm_strs[i,'X..of.Units'] = med
  }
}
#svm_strs=cbind(svm_strs,)
#rownames(svm_strs) = names(entrop)
#rbf = rbfdot(0.5)
#k = kernelMatrix(rbf,)

####
# TRY RIDGE REGRESSION INSTEAD??? TRIES TO MINIMIZE VAR OF SOLUTION... MAYBE GOOD...
# actually didn't seem to work
####

# first do all data to see how it is
tryk = ksvm(logged~.,as.matrix(svm_strs),cross=5,scaled=FALSE)
svm_pred = predict(tryk,svm_strs)
svm_cor = cor(svm_pred,logged)
svm_mse = MSE(svm_pred,logged)
plot(svm_pred,logged,main=paste('SVM r=',svm_cor,'MSE =',svm_mse),cex=.4) # better than varscore

# now try on nc data
nck = ksvm(logged[rownames(nc)]~.,as.matrix(svm_strs[rownames(nc),]),cross=5,scaled=FALSE)
nc_svm_pred = predict(nck,svm_strs[rownames(nc),])

nc_svm_cor = cor(nc_svm_pred,logged[rownames(nc)])
nc_svm_mse = cor(nc_svm_pred,logged[rownames(nc)])

plot(nc_svm_pred,logged[rownames(nc)],main=paste('NC SVM r=',nc_svm_cor),cex=.4,xlab='NC SVR') 

# cv.lm() from DAAG package
# compare to SVR fit just to see
entrop = logged
lm.data = cbind(entrop[rownames(nc)],svm_strs[rownames(nc),])
all_lm.data = cbind(entrop,svm_strs)
colnames(lm.data)[1] = 'entropic'
colnames(all_lm.data)[1] = 'entropic'
fitlm = lm(entropic ~ ., data=lm.data)
fitalllm = lm(entropic ~., data=all_lm.data)
lmallcv= cv.lm(data=all_lm.data,form.lm=fitlm,m=5)
lmcv = cv.lm(data=all_lm.data,form.lm=fitlm,m=5)

# plot the results, PCCs
par(mfrow=c(1,2))
plot(nc_entro_allpred,entrop,cex=.4,main=cor(nc_entro_allpred,entrop),xlab='NC str-trained SVR, 5-fold CV',ylab='Entropy')
plot(lmcv$cvpred,entrop,xlab='NC str-trained LM, 5-fold CV',ylab='Entropy',main=cor(entrop,lmcv$cvpred),cex=.4)
par(mfrow=c(1,1))

# evaluate training vs test - loop through and see what distributions are for nc strs
mses = c()
cors = c()
nc_predic_mat = c()
cod_mses = c()
cod_predic_mat = c()
dhs_predic_mat = c()
intr_predic_mat = c()
ncs = rownames(nc)
train_size = round(nrow(nc) / 2)
#train_size = nrow(svm_strs) / 10
#train_size = 200

# REMEMBER TO SWITCH BACK
# actually, log(SD) seems to work better!! much narrower distributions
# or not??? wtf is going on??
### NEVER SOLVED THIS... unclear what happened that time
entrop = logged

for (i in 1:1000) {
  if (i%%64 == 0) {print(i)}
  print(i)
  #rand_strs = sample(rownames(nc))
  rand_strs = sample(rownames(svm_strs))
  train = rand_strs[1:train_size]
  test = rand_strs[(train_size+1):length(rand_strs)]
  fit = svr_fit(data = svm_strs,response = entrop,str_ids = train)
  predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=ncs,response=entrop)
  mses[i] = predic$mse
  cors[i] = predic$cor
  rownames(predic$predict) = ncs  # apparently this is a matrix...
  nc_predic_mat = cbind(nc_predic_mat, predic$predict[rownames(nc),])
  # coding predictions
  predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(cod),response=entrop)
  cod_mses[i] = predic$mse
  rownames(predic$predict) = rownames(cod)  # apparently this is a matrix...
  cod_predic_mat = cbind(cod_predic_mat,predic$predict[rownames(cod),])
  # dhs predictions
  predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(dhs),response=entrop)
  rownames(predic$predict) = rownames(dhs)
  dhs_predic_mat = cbind(dhs_predic_mat,predic$predict[rownames(dhs),])
  # intron predictions
  predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(intr),response=entrop)
  rownames(predic$predict) = rownames(intr)
  intr_predic_mat = cbind(intr_predic_mat,predic$predict[rownames(intr),])
}

boxplot.matrix(t(nc_predic_mat[1:30,]),las=2)
points(1:30,entrop[rownames(nc)][1:30],pch=19,col='red',cex=.5)

# for nc
ncmeaned = rowMeans(nc_predic_mat)
ncsded = apply(nc_predic_mat,1,sd)
nczscores = (entrop[rownames(nc)]-ncmeaned) / ncsded
hist(nczscores,20)

codmeaned = rowMeans(cod_predic_mat)
codsded = apply(cod_predic_mat,1,sd)
codzscores = (entrop[rownames(cod)]-codmeaned) / codsded
hist(codzscores,20)
codselpos = cbind(codzscores[codzscores>5],strs[names(codzscores[codzscores>5]),])

dhsmeaned = rowMeans(dhs_predic_mat)
dhssded = apply(dhs_predic_mat,1,sd)
dhszscores = (entrop[rownames(dhs)]-dhsmeaned) / dhssded
hist(dhszscores,20)

intrmeaned = rowMeans(intr_predic_mat)
intrsded = apply(intr_predic_mat,1,sd)
intrzscores = (entrop[rownames(intr)]-intrmeaned) / intrsded
hist(intrzscores,20)

plot(density(nczscores),xlab='Z-score',main='based on ensemble prediction',xlim=c(-19,19),ylim=c(0,.25),col='blue')
lines(density(codzscores),col='red')
lines(density(dhszscores),col='green')
lines(density(intrzscores),col='purple')
abline(v=3) #cutoff
abline(v=-3) #cutoff
legend(10,.17,legend=c('coding','noncoding','DHS','Intron'),fill=c('red','blue','green','purple'))

plot(codmeaned,entrop[rownames(cod)],main=cor(codmeaned,entrop[rownames(cod)]))
plot(dhsmeaned,entrop[rownames(dhs)],main=cor(dhsmeaned,entrop[rownames(dhs)]))
plot(ncmeaned,entrop[rownames(nc)],main=cor(ncmeaned,entrop[rownames(nc)]))
plot(intrmeaned,entrop[rownames(intr)],main=cor(intrmeaned,entrop[rownames(intr)]))


# are deviations systematic??? not really...?
#plot(ncmeaned,nczscores,main=cor(nczscores,ncmeaned),xlim=c(-.5,.5),ylim=c(-12,12),cex=.2)
plot(ncmeaned,nczscores,main=cor(nczscores,ncmeaned),xlim=c(-.75,.75),ylim=c(-19,19),cex=.2)
points(intrmeaned,intrzscores,main=cor(intrzscores,intrmeaned),col='green',xlim=c(-.5,.5),ylim=c(-10,10),cex=.2)
points(codmeaned,codzscores,main=cor(codzscores,codmeaned),col='red',xlim=c(-.5,.5),ylim=c(-10,10),cex=.2)
points(dhsmeaned,dhszscores,main=cor(dhszscores,dhsmeaned),col='blue',xlim=c(-.5,.5),ylim=c(-10,10),cex=.2)

#predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids = nc_test,response = entrop)

allzs = append(nczscores, codzscores)
allzs = append(allzs, intrzscores)
allzs = append(allzs, dhszscores)

var_strs = cbind(strs[names(allzs),], allzs)
tabled_var_strs = table(var_strs$Consensus)
var_strs = var_strs[var_strs$Consensus %in% names(which(tabled_var_strs>4)),]
boxplot(var_strs$allzs~var_strs$Consensus, las=2, ylab='Z-scores')
