# analyze new STR genotype data
# first pass 8/11/16
library(vegan)
library(kernlab)
library(stringr)
library(DAAG)
set.seed(311)

num_strs = 2050
# according to TAIR10 golden path
#chr_lens = c(30427671,19698289,23459830,18585056,26975502)
pseudocount = .5
selthresh = 3
boot_divisor = 4

countnas = function(matrix) {
  nanum = c()
  for (i in rownames(matrix)) {
    nanum[i] = length(which(is.na(matrix[i,])))
  }
  return(nanum)
}


gc_content = function(str) {
  gc = str_count(str,'G') + str_count(str,'C')
  n=nchar(str)
  return(gc/n)
  }

genos = read.table('~/Dropbox/Ath_STRs/problem_mip_genotypes_081016.txt',header=T)
info = read.csv('~/Dropbox/Ath_STRs/mip_lib_info_081116.csv',header=T,stringsAsFactors = FALSE)
annots = read.csv('~/Dropbox/Ath_STRs/str_annots.csv',header=T)
lib_names = gsub('_R1_001.fastq.gz','',info[1:96,'spikein_file'])
lib_names = gsub('-','.',lib_names)
info = info[1:96,]
rownames(info) = lib_names
info = info[colnames(genos),]
colnames(genos) = info$Strain
geography = info[,'Region.1001genomes.']
regions = unique(geography)
geog_colors = c('red','blue','black','dark green','purple','orange','hot pink')
#geog_colors= rainbow(length(regions))
names(geog_colors) = regions

# work around known crappiness
genos[genos<0] = NA   # this happens when impossible lengths are found


# really basic analysis... how many calls?
count_calls = function(gen_tab) {
  return(length(which(!(is.na(genos)))))
  }

count_alleles = function(gen_tab,numcols,sum=FALSE) {
  tablature = apply(gen_tab[,1:numcols,drop=FALSE],1,table)
  lengthed = simplify2array(lapply(tablature,length))
  if (sum) {
  return(sum(lengthed))
  } else {return(lengthed)}
  }

rarefy_count_alleles = function(gen_tab) { 
  gen_tab=gen_tab[,sample(colnames(gen_tab))]
  rared = sapply(1:96,count_alleles,gen_tab = gen_tab, sum=TRUE)
  return(rared)
  }

all_calls = count_calls(genos)

cat('num genotype calls total:',all_calls,'\n','proportion called:',all_calls/(num_strs*96),'\n')

# estimate proportion of calls overall by strain
strain_ascertain = 1 - (countnas(t(genos)) / num_strs)
hist(strain_ascertain,20,xlab='Proportion STR genotypes called per strain',main='')

# by str
nacount = countnas(genos)
hist(96-as.numeric(nacount),20,xlab='Number of genotype calls per STR (of 96)',main='')

# how many alleles per repeat?? 
#tablature = apply(genos,1,table)
#lengthed = simplify2array(lapply(tablature,length))
lengthed = count_alleles(genos,96)
hist(lengthed,25,xlab='Number of alleles / STR',main='')
cat('average number of alleles:',mean(lengthed),'median:',median(lengthed),'\n')
cat(length(which(lengthed==0)),'strs have no genotype calls in any strain','\n')

# # rarefaction of alleles??
# # takes a while- uncomment if desire to rerun
#  num_rars = 10
#  num_strains = ncol(genos)
#  rar_mat = matrix(rep(NA,num_rars*num_strains),num_rars)
#  plot(1,type='n',xlim=c(0,100),ylim=c(1000,sum(lengthed)), las=1, ylab='Number of alleles', xlab='Number of strains sampled')
# # 
#  for (i in 1:num_rars) {
#    cat('rarefaction subsample',i,'\n')
#    rar_mat[i,] = rarefy_count_alleles(genos)
#    lines(1:96,rar_mat[i,])
#    }

# plot distribution of most common allele frequency
major_allele_freq = function(x) {
  x = unlist(x)
  if ( length(which(is.na(x))) > 86 ) {return(NA)}
  commonest = sort(table(na.omit(x)),decreasing=TRUE)[1]
  freq = commonest / length(na.omit(x))
  return(freq)
  }

# look at allele freq spectrum (roughly... for lots of info see Haasl and Payseur 2010 MBE)
major_allele_freqs = apply(genos,1,major_allele_freq)
plot(density(na.omit(major_allele_freqs),bw=.01),xlab='Major STR allele frequency',main='',xlim=c(0,1))
lessthan50percent = length(which(na.omit(major_allele_freqs)<=.5))
monoallelic = length(which(major_allele_freqs==1))
total = length(na.omit(major_allele_freqs))
cat(lessthan50percent/total*100,'% of STRs have major allele frequencies <=50%\n',sep='')
cat(monoallelic/total*100,'% of STRs are monoallelic\n',sep='')

# Haasl and Payseur 2010 state that an estimate of theta (scaled mu = 4Ne*mu) is 
# theta = (1 / (8*mean(x)^2)) - 1/2 (Eq. 11), where mean(x) is the mean frequency of alleles at a given locus
# pretty straightforward i think? though the constants may change w/ n (pop sample size)... should email em.
# esp. because my sample size itself varies a bit!!!
theta_hat = function(x) {
  x = unlist(x)
  if ( length(which(is.na(x))) > 71 ) {return(NA)}
  allele_counts = sort(table(na.omit(x)),decreasing=TRUE)
  if (length(allele_counts) < 2) {return(NA)}
  freqs = allele_counts / length(na.omit(x))
  xbar2 = mean(freqs)^2
  thetahat = (1 / (8*xbar2)) - 1/2
  return(thetahat)
}
thetaed = apply(genos,1,theta_hat)

# plot the estimator... some really high values there!!
plot(density(na.omit(thetaed)),xlab='Theta_hat_xbar',main='')

# see if weirdness is introduced by sample size. a few values are below zero (!!!) but no obvious sample size reason for that
annots = annots[rownames(genos),]
cored = cor.test(96-nacount, thetaed,na.rm=TRUE,xlim=c(0,40),xlab='# units in Col',ylab='Theta_hat')$estimate
plot(96-nacount,thetaed,pch='.',xlab='n',ylab='theta_hat_xbar',main=cored)

# look for correlations with other things that would predict var
# make sure that you've run the Ath_MIPs_FigS1_design.R script or version thereof to populate
# env with annots and other stuff!!
cored = cor.test(annots$X..of.Units, thetaed,na.rm=TRUE,xlim=c(0,40),xlab='# units in Col',ylab='Theta_hat')$estimate
plot(annots$X..of.Units, thetaed,xlim=c(5,40),xlab='# units in Col',ylab='Theta_hat',pch='.',main=cored)
cored = cor.test(annots$Unit.Size, thetaed,na.rm=TRUE,xlim=c(0,40),xlab='# units in Col',ylab='Theta_hat')$estimate
boxplot(thetaed~annots$Unit.Size,xlab='Unit size',ylab='Theta_hat',pch='.',main=cored)
cored = cor.test(annots$Purity, thetaed,na.rm=TRUE,xlim=c(0,40),xlab='# units in Col',ylab='Theta_hat')$estimate
boxplot(thetaed~annots$Purity,xlab='% Purity',ylab='Theta_hat',pch='.',main=cored)

par(mfrow=c(2,3))
high_theta = names(na.omit(thetaed[thetaed>60]))

for (thet in high_theta) {
  hist(as.numeric(genos[thet,]),30,xlab=paste('STR',thet,'unit number'),main='')
  }
par(mfrow=c(1,1))

# various ordination approaches
# nmds- from vegan package. scaling to avoid weirdness from highly var reps- necessary?
scale_dists = dist(scale(t(genos)))
dists = dist(t(genos))
nmds_scale = metaMDS(scale_dists)   # that looks really weird... no convergence # now fixed?
plot(nmds_scale,type='n')
text(nmds_scale,labels=rownames(nmds_scale$points))

nmds = metaMDS(dists)
plot(nmds,type='n')   # VERY DIFFERENT!! 
text(nmds,labels=rownames(nmds$points)) # different, better, but poor sep

pca = prcomp(t(na.omit(genos)))
plot(pca$x[,1],pca$x[,2],type='n')  # not bad, but not great
text(pca$x[,1],pca$x[,2],labels=colnames(genos))  

var_genos = t(na.omit(genos))
vars = apply(var_genos,2,var)
var_genos = var_genos[,vars>0]
scale_pca = prcomp(var_genos,scale=T)
cat('first PCA eigs:',scale_pca$sdev[1:10]/sum(scale_pca$sdev),'\n')
plot(scale_pca$x[,1],scale_pca$x[,2],type='n',xlab='PC1',ylab='PC2',main='PCA')  # not bad, but not great
text(scale_pca$x[,1],scale_pca$x[,2],labels=info[1:96,'Strain'],cex=.7,col = geog_colors[geography],srt=45)
legend(12,10,legend=regions,fill=geog_colors,cex=.7)

plot(scale_pca$x[,2],scale_pca$x[,3],type='n',xlab='PC2',ylab='PC3',main='PCA')  # not bad, but not great
text(scale_pca$x[,2],scale_pca$x[,3],labels=info[1:96,'Strain'],cex=.5,col = geog_colors[geography],srt=45)
legend(10,20,legend=regions,fill=geog_colors,cex=.7)

plot(scale_pca$x[,1],scale_pca$x[,2],cex=.7,col = geog_colors[geography],xlab='PC1',ylab='PC2',main='PCA',pch=19)  # not bad, but not great
#text(scale_pca$x[,1],scale_pca$x[,2],labels=colnames(genos),cex=.5)
legend(12,10,legend=regions,fill=geog_colors,cex=.7)
# PC3 shows cvi/mr-0 craziness well.

# classical mds - pretty similar, emphasizes outliers as opposed to fine structure
cmd = cmdscale(dists,k=2,eig=TRUE)
plot(cmd$points,type='n',xlab='MDS1',ylab='MDS2',main='Classical MDS')
text(cmd$points,labels=info[1:96,'Strain'],cex=.7,col = geog_colors[geography],srt=45)
legend(-40,40,legend=regions,fill=geog_colors,cex=.7)
cat('first MDS eigs:',cmd$eig[1:10]/sum(cmd$eig),'\n')

# ok! now selection analysis.
strs = read.csv('~/Dropbox/Ath_STRs/str_var/str_masterdata.csv',header=T,stringsAsFactors=FALSE)
strs = cbind(strs,gc_content(strs$Consensus))
colnames(strs)[colnames(strs)=="gc_content(strs$Consensus)"] = 'GC_content'
#strs = strs[strs$Keisha_moniker!='' & strs$Keisha_moniker!='1',]
#pat = '([A-Z]+_[0-9]+)_[A-Za-z_0-9 \\*]+'
rownames(strs) = strs$ID

# add MIP arm info
mips = read.csv('~/Dropbox/Ath_STRs/2100_mips_designed.csv',header=T,stringsAsFactors = FALSE)
# have to reparse these...
mip_ids = gsub('([0-9]+)_.+', '\\1', mips$mip_name)
rownames(mips) = mip_ids
ta_gc = rep(NA,nrow(strs))
la_gc = rep(NA,nrow(strs))
scan_gc = rep(NA,nrow(strs))
names(ta_gc) = rownames(strs)
names(la_gc) = rownames(strs)

# get GC contents
ta_gc[mip_ids] = gc_content(mips$ext_probe_sequence)
la_gc[mip_ids] = gc_content(mips$lig_probe_sequence)
scan_gc[mip_ids] = gc_content(mips$scan_target_sequence)

strs = cbind(strs,ta_gc = ta_gc[rownames(strs)],
             la_gc = la_gc[rownames(strs)],
             scan_gc = scan_gc[rownames(strs)])
       

# check against ref, report numbers
bound = na.omit(cbind(as.numeric(genos[as.character(strs$ID),'Col']),strs$X..of.Units))
accuracy = length(which(bound[,1]==bound[,2])) / nrow(bound)
cat('in Col, typed',length(na.omit(genos[,'Col'])),'of 2050 strs, among which', accuracy, 'accuracy\n')

# some measures of spread
genos[nacount>70,] = rep(NA,96)


logged = na.omit(log10(apply(genos,1,sd,na.rm=TRUE)+pseudocount))
sd_strs = names(logged)
strs = strs[sd_strs,]

#stop('stop here to avoid running selection analysis')
#plot()

#annotation = read.csv('~/Dropbox/Ath_STRs/2100_final_picked_STRs_021816.csv',header=T,stringsAsFactors = FALSE,encoding='latin1')[sd_strs,]
#annotation = read.table('~/Dropbox/Ath_STRs/str_annots_picked.txt',header=T,sep='\t',stringsAsFactors = FALSE)
#annotation[annotation=='HS'] = 'DHS'
annotation = read.delim('~/Dropbox/Ath_STRs/araport_annot/Ath_STRs_full_annotations_111616.tsv',header=T)
rownames(annotation) = annotation$ID  
annotation = annotation[sd_strs,]
sd_genos = genos[sd_strs,]

# see if correcting pop structure does anything?
## THE LMEKIN SOLUTION DOES NOT WORK
load('~/Dropbox/Ath_STRs/Kinmat.Rdat') # becomes 'K'
colnames(K) = colnames(sd_genos) # not matching case etc.
rownames(K) = colnames(sd_genos) # etc.
lmer_corr = function(genos,id,K) {
  require(lmekin)
  dat = na.omit(data.frame(genos=as.numeric(genos),id=id))
  nullmodel = lmekin(genos ~ (1|id), varlist = K, data=dat, method='ML')
  return(residuals(nullmodel))
  }

# TRY PCA POP STRUCTURE CORRECTION
lm_corr_logsd_entrop = function(geno,pca,type='logsd') {
  require(infotheo)
  dat = na.omit(data.frame(as.numeric(geno), pca$x[,1:10] ) )
 #print(dat)
  colnames(dat) = c('geno','pc1','pc2','pc3','pc4','pc5','pc6','pc7','pc8','pc9','pc10')  
  mod = lm(geno ~ ., data=dat)
  resid = residuals(mod)
  if (type=='logsd') { return( log10(sd(resid)+.05) ) } else if (type=='entropy') { return( entropy(resid)  )  } else {return(NA)}
  #return(list('entro' = entr, 'logstdev' = logstdev))
  }


# split by annotation
nc = sd_genos[annotation$annotation %in% c('intergenic','plus/minus_1kb'),]
cod = sd_genos[annotation$annotation %in% c('exon','exon/DHS'),]
dhs = sd_genos[annotation$annotation %in% c('DHS','exon/DHS','DHS/non_gene','intron/DHS'),]
intr = sd_genos[annotation$annotation %in% c('intron','intron/DHS'),]

plot(strs[,'VARscore'],logged,ylab='logSD(str)',cex=.4)

require(infotheo)
entrop = apply(round(sd_genos[sd_strs,]),1,entropy)
varcor = cor(strs[,'VARscore'],entrop)
plot(strs[,'VARscore'],entrop,main=paste('VARscore r=',varcor),cex=.4,xlab='VARscore',ylab='STR entropy')

# distributions of responses
par(mfrow=c(2,2))
hist(logged,20,xlab='log(SD of STR variation)',main='')
hist(entrop,20,xlab='Entropy of STR variation',main='')
plot(logged,strs[,'VARscore'],xlab='log(SD of STR variation)',ylab='VARscore',pch='.',main=cor.test(logged,strs[,'VARscore'],na.rm=TRUE)$estimate)
plot(entrop,strs[,'VARscore'],xlab='Entropy of STR variation',ylab='VARscore',pch='.',main=cor.test(entrop,strs[,'VARscore'],na.rm=TRUE)$estimate)
par(mfrow=c(1,1))

cove = cov(cbind(strs$VARscore,entrop))
maha = mahalanobis(cbind(strs$VARscore,logged),cov=cove,center=FALSE)

#cove = cov(na.omit(cbind(strs[rownames(nc),]$VARscore,entrop[rownames(nc)])))
mahan = mahalanobis(na.omit(cbind(strs[rownames(nc),]$VARscore,logged[rownames(nc)])),cov=cove,center=FALSE)

#cove = cov(na.omit(cbind(strs[rownames(cod),]$VARscore,entrop[rownames(cod)])))
mahac = mahalanobis(na.omit(cbind(strs[rownames(cod),]$VARscore,logged[rownames(cod)])),cov=cove,center=FALSE)

#cove = cov(na.omit(cbind(strs[rownames(dhs),]$VARscore,entrop[rownames(dhs)])))
mahad =  mahalanobis(na.omit(cbind(strs[rownames(dhs),]$VARscore,logged[rownames(dhs)])),cov=cove,center=FALSE)

#cove = cov(na.omit(cbind(strs[rownames(intr),]$VARscore,entrop[rownames(intr)])))
mahai = mahalanobis(na.omit(cbind(strs[rownames(intr),]$VARscore,logged[rownames(intr)])),cov=cove,center=FALSE)

# not very interesting....
plot(density(maha),xlim=c(0,40),ylim=c(0,.10),main='mahalanobis distance distributions')
lines(density(mahac),col='red')
lines(density(mahan),col='blue')
lines(density(mahad),col='green')
lines(density(mahai),col='purple')
legend(30,.09,legend=c('All','coding','noncoding','DHS','Intron'),fill=c('black','red','blue','green','purple'))

# try a SVM?? (like legendre) (SVR (regression)- using epsilon regression,
# which is apparently better in situations where you're not too worried about 
# the complexity of the solution: 
# http://stats.stackexchange.com/questions/94118/difference-between-ep-svr-and-nu-svr-and-least-squares-svr)
# i have n>>p so i am ok i think

# mean squared error to evaluate model performance
MSE = function(predicted,actual) {
  sq_err = (predicted-actual)^2
  return(mean(sq_err))
  }

svr_fit = function(data,str_ids,response) {
  fit=ksvm(response[str_ids]~.,as.matrix(data[str_ids,]),cross=5,scaled=FALSE)
  return(fit)
  }

svr_pred_eval = function(svr, newdata, str_ids,response) {
  pred = predict(svr,newdata=newdata[str_ids,])
  core = cor(response[str_ids], pred)
  mse = MSE(response[str_ids], pred)
  out = list(pred,core,mse)
  names(pred) = str_ids
  names(out)=c('predict','cor','mse')
  return(out)
  }

# restrict to numeric predictors, trying various... added multiple gc contents for mips too
#svm_strs=strs[,c("TAIR10start","Unit.Size","Purity","X..of.Units","GC_content","ta_gc","la_gc","scan_gc")]
svm_strs=strs[,c("Unit.Size","Purity","X..of.Units","GC_content","ta_gc","la_gc","scan_gc")]
# worked ok
#svm_strs=strs[,c("TAIR10start","Chromosome","Unit.Size","Purity","X..of.Units","GC_content")]
# also ok
#svm_strs=strs[,c("Chromosome","Unit.Size","Purity","X..of.Units","GC_content")]
# a bit narrower, overall somewhat better fit? (with SVR)
#svm_strs=strs[,c("Unit.Size","Purity","X..of.Units","GC_content")]


#svm_strs$TAIR10start = svm_strs$TAIR10start / chr_lens[strs$Chromosome]

rownames(svm_strs) = names(entrop)
# replace reference copy number with median of observed if present
for (i in names(entrop)) {
  #print(i)
  names(svm_strs$X..of.Units) = names(entrop)
  med = median(as.numeric(genos[i,]),na.rm=TRUE)
  #print(med)
  if (is.na(med)) { continue }
  else {
  svm_strs[i,'X..of.Units'] = med
  }
}

# first do all data to see how it is
tryk = ksvm(logged~.,as.matrix(svm_strs),cross=5,scaled=FALSE)
svm_pred = predict(tryk,svm_strs)
svm_cor = cor(svm_pred,logged)
svm_mse = MSE(svm_pred,logged)
plot(svm_pred,logged,main=paste('SVM r=',svm_cor,'MSE =',svm_mse),cex=.4) # better than varscore

# entropy better?
entrok = ksvm(entrop~.,as.matrix(svm_strs),cross=5,scaled=FALSE)
entro_pred = predict(entrok,svm_strs)
names(entro_pred) = names(entrop)
entro_cor = cor(entro_pred,entrop)
entro_mse = MSE(entro_pred,entrop)
plot(entro_pred,entrop,xlab='SVR prediction',ylab='entropy',main=paste('SVM r=',entro_cor,'MSE =',entro_mse),cex=.4) # better than log-SD?

####
# try mahalanobis analysis again, with SVR prediction instead of just varscore
cove = cov(cbind(entro_pred,entrop))
maha = mahalanobis(cbind(entro_pred,entrop),cov=cove,center=FALSE)
mahan = mahalanobis(na.omit(cbind(entro_pred[rownames(nc)],entrop[rownames(nc)])),cov=cove,center=FALSE)
mahac = mahalanobis(na.omit(cbind(entro_pred[rownames(cod)],entrop[rownames(cod)])),cov=cove,center=FALSE)
mahad =  mahalanobis(na.omit(cbind(entro_pred[rownames(dhs)],entrop[rownames(dhs)])),cov=cove,center=FALSE)
mahai = mahalanobis(na.omit(cbind(entro_pred[rownames(intr)],entrop[rownames(intr)])),cov=cove,center=FALSE)

# not very interesting....
plot(density(maha),xlim=c(0,35),ylim=c(0,.2),main='mahalanobis distance distributions')
lines(density(mahac),col='red')
lines(density(mahan),col='blue')
lines(density(mahad),col='green')
lines(density(mahai),col='purple')
legend(30,.09,legend=c('All','coding','noncoding','DHS','Intron'),fill=c('black','red','blue','green','purple'))

# now try on nc data
ncs = rownames(nc)
nck = ksvm(logged[rownames(nc)]~.,as.matrix(svm_strs[rownames(nc),]),cross=5,scaled=FALSE)
nck_entro = ksvm(entrop[rownames(nc)]~.,as.matrix(svm_strs[rownames(nc),]),cross=5,scaled=FALSE)
nc_svm_pred = predict(nck,svm_strs[rownames(nc),])
nc_svm_pred_entro = predict(nck_entro,svm_strs[rownames(nc),])

nc_svm_cor = cor(nc_svm_pred,logged[rownames(nc)])
nc_svm_entro_cor = cor(nc_svm_pred_entro,entrop[rownames(nc)])
nc_svm_mse = cor(nc_svm_pred,logged[rownames(nc)])
nc_svm_entro_mse = MSE(nc_svm_pred_entro,entrop[rownames(nc)])

plot(nc_svm_pred,logged[rownames(nc)],main=paste('NC SVM r=',nc_svm_cor),cex=.4,xlab='NC SVR') # better than log-SD?
plot(nc_svm_pred_entro,entrop[rownames(nc)],main=paste('NC SVM r=',nc_svm_entro_cor),cex=.4,xlab='NC SVR') # better than log-SD?

# moving forward with entropy, somewhat better justification...
cod_entro_ncpred = predict(nck_entro,svm_strs[rownames(cod),])

cod_entro_svm = ksvm(entrop[rownames(cod)]~., as.matrix(svm_strs[rownames(cod),]),cross=5,scaled=FALSE)
cod_entro_codpred = predict(cod_entro_svm,svm_strs[rownames(cod),])
cod_entro_codpred_cor = cor(cod_entro_codpred,entrop[rownames(cod)])
plot(cod_entro_codpred,entrop[rownames(cod)],cex=.4,main=cod_entro_codpred_cor)

nc_entro_codpred = predict(nck_entro,svm_strs[rownames(nc),])
plot(nc_entro_codpred,entrop[rownames(nc)],cex=.4,main=cor(nc_entro_codpred,entrop[rownames(nc)]))

nc_entro_allpred = predict(nck_entro,svm_strs)
nc_log_allpred = predict(nck,svm_strs)
names(nc_log_allpred) = rownames(svm_strs)
plot(nc_entro_allpred,entrop,cex=.4,main=cor(nc_entro_allpred,entrop),xlab='NC str-trained SVR, 5-fold CV',ylab='Entropy')
plot(nc_log_allpred,logged,cex=.4,main=cor(nc_log_allpred,logged),xlab='NC str-trained SVR, 5-fold CV',ylab='Log(SD)')


# cv.lm() from DAAG package
# compare to SVR fit just to see
entrop = logged
lm.data = cbind(entrop[rownames(nc)],svm_strs[rownames(nc),])
all_lm.data = cbind(entrop,svm_strs)
colnames(lm.data)[1] = 'entropic'
colnames(all_lm.data)[1] = 'entropic'
fitlm = lm(entropic ~ ., data=lm.data)
fitalllm = lm(entropic ~., data=all_lm.data)
# commented because they make big irritating plots
#lmallcv= cv.lm(data=all_lm.data,form.lm=fitlm,m=5)
lmcv = cv.lm(data=all_lm.data,form.lm=fitlm,m=5)

# plot the results, PCCs
par(mfrow=c(1,2))
plot(nc_log_allpred,logged,cex=.4,main=cor(nc_log_allpred,logged),xlab='NC str-trained SVR, 5-fold CV',ylab='Log(SD)')
points(nc_log_allpred[ncs],logged[ncs],cex=.4,col='black',pch=19)
points(nc_log_allpred[rownames(dhs)],logged[rownames(dhs)],cex=.4,col='blue',pch=19)
points(nc_log_allpred[rownames(intr)],logged[rownames(intr)],cex=.4,col='green',pch=19)
points(nc_log_allpred[rownames(cod)],logged[rownames(cod)],cex=.4,col='red',pch=19)
plot(lmcv$cvpred,entrop,xlab='NC str-trained LM, 5-fold CV',ylab='Log(SD)',main=cor(logged,lmcv$cvpred),cex=.4)

print(cor.test(nc_log_allpred[ncs],logged[ncs]))
print(cor.test(nc_log_allpred[rownames(dhs)],logged[rownames(dhs)]))
print(cor.test(nc_log_allpred[rownames(intr)],logged[rownames(intr)]))
print(cor.test(nc_log_allpred[rownames(cod)],logged[rownames(cod)]))

par(mfrow=c(1,1))

# evaluate training vs test - loop through and see what distributions are for nc strs
mses = c()
cors = c()
nc_predic_mat = c()
cod_mses = c()
cod_predic_mat = c()
dhs_predic_mat = c()
intr_predic_mat = c()
train_size = round(nrow(nc) / boot_divisor)
#train_size = round(nrow(nc) / 2)
#train_size = nrow(svm_strs) / 10
#train_size = 200

# REMEMBER TO SWITCH BACK
# actually, log(SD) seems to work better!! much narrower distributions
# or not??? wtf is going on??
### NEVER SOLVED THIS... unclear what happened that time
entrop = logged

for (i in 1:1000) {
if (i%%64 == 0) {print(i)}
  print(i)
#rand_strs = sample(rownames(nc))
rand_strs = sample(rownames(svm_strs))
train = rand_strs[1:train_size]
test = rand_strs[(train_size+1):length(rand_strs)]
fit = svr_fit(data = svm_strs,response = entrop,str_ids = train)
predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=ncs,response=entrop)
mses[i] = predic$mse
cors[i] = predic$cor
rownames(predic$predict) = ncs  # apparently this is a matrix...
nc_predic_mat = cbind(nc_predic_mat, predic$predict[rownames(nc),])
# coding predictions
predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(cod),response=entrop)
cod_mses[i] = predic$mse
rownames(predic$predict) = rownames(cod)  # apparently this is a matrix...
cod_predic_mat = cbind(cod_predic_mat,predic$predict[rownames(cod),])
# dhs predictions
predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(dhs),response=entrop)
rownames(predic$predict) = rownames(dhs)
dhs_predic_mat = cbind(dhs_predic_mat,predic$predict[rownames(dhs),])
# intron predictions
predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(intr),response=entrop)
rownames(predic$predict) = rownames(intr)
intr_predic_mat = cbind(intr_predic_mat,predic$predict[rownames(intr),])
}

boxplot.matrix(t(nc_predic_mat[1:30,]),las=2)
points(1:30,entrop[rownames(nc)][1:30],pch=19,col='red',cex=.5)

# for nc
ncmeaned = rowMeans(nc_predic_mat)
ncsded = apply(nc_predic_mat,1,sd)
nczscores = (entrop[rownames(nc)]-ncmeaned) / ncsded
hist(nczscores,20)

codmeaned = rowMeans(cod_predic_mat)
codsded = apply(cod_predic_mat,1,sd)
codzscores = (entrop[rownames(cod)]-codmeaned) / codsded
hist(codzscores,20)
codselneg = cbind(codzscores[codzscores<(-selthresh)],strs[names(codzscores[codzscores<(-selthresh)]),])
codselpos = cbind(codzscores[codzscores>selthresh],strs[names(codzscores[codzscores>selthresh]),])

dhsmeaned = rowMeans(dhs_predic_mat)
dhssded = apply(dhs_predic_mat,1,sd)
dhszscores = (entrop[rownames(dhs)]-dhsmeaned) / dhssded
hist(dhszscores,20)

intrmeaned = rowMeans(intr_predic_mat)
intrsded = apply(intr_predic_mat,1,sd)
intrzscores = (entrop[rownames(intr)]-intrmeaned) / intrsded
hist(intrzscores,20)

plot(density(nczscores),xlab='Z-score',main='based on ensemble prediction',xlim=c(-14,20),ylim=c(0,.25),col='black')
lines(density(codzscores),col='red')
lines(density(dhszscores),col='blue')
lines(density(intrzscores),col='green')
abline(v=selthresh) #cutoff
abline(v=-selthresh) #cutoff
legend(10,.17,legend=c('coding','noncoding','DHS','Intron'),fill=c('red','black','blue','green'))

plot(codmeaned,entrop[rownames(cod)],main=cor(codmeaned,entrop[rownames(cod)]))
plot(dhsmeaned,entrop[rownames(dhs)],main=cor(dhsmeaned,entrop[rownames(dhs)]))
plot(ncmeaned,entrop[rownames(nc)],main=cor(ncmeaned,entrop[rownames(nc)]))
plot(intrmeaned,entrop[rownames(intr)],main=cor(intrmeaned,entrop[rownames(intr)]))


# are deviations systematic??? not really...?
#plot(ncmeaned,nczscores,main=cor(nczscores,ncmeaned),xlim=c(-.5,.5),ylim=c(-12,12),cex=.2)
plot(ncmeaned,nczscores,main=cor(nczscores,ncmeaned),xlim=c(-.2,.75),ylim=c(-19,25),cex=.2)
points(intrmeaned,intrzscores,main=cor(intrzscores,intrmeaned),col='green',xlim=c(-.5,.5),ylim=c(-10,10),cex=.2)
points(codmeaned,codzscores,main=cor(codzscores,codmeaned),col='red',xlim=c(-.5,.5),ylim=c(-10,10),cex=.2)
points(dhsmeaned,dhszscores,main=cor(dhszscores,dhsmeaned),col='blue',xlim=c(-.5,.5),ylim=c(-10,10),cex=.2)

#predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids = nc_test,response = entrop)

allzs = append(nczscores, codzscores)
allzs = append(allzs, intrzscores)
allzs = append(allzs, dhszscores)

var_strs = cbind(strs[names(allzs),], allzs)
tabled_var_strs = table(var_strs$Consensus)
var_strs = var_strs[var_strs$Consensus %in% names(which(tabled_var_strs>4)),]
boxplot(var_strs$allzs~var_strs$Consensus, las=2, ylab='Z-scores')

# find str expansions (relative)
# takes str allele distribution as input
find_expansions = function(matrix,str_id) {
  distr = na.omit(as.numeric(matrix[str_id,]))
  med = median(distr)
  stdev = sd(distr)
  maxim = max(distr)
  return(((maxim-med)/med))
}

max_med= c()
for (str in rownames(genos)) {
  max_med[str] = find_expansions(genos,str )
}

hist(max_med,40,xlab='Expansion score\n(max unit number / median unit number)',main='')
expanded = na.omit(max_med[max_med>2])  #totally arbitrary

# ntm1 intronic str was interesting (expansion score ~ 6):
hist(as.numeric(genos['65400',]),20,main='',xlab='NTM1 intron STR copy number',xlim=c(0,20))

