# analyze new STR genotype data
# refactored 3/25/17
# updated substantially 12/01/16
# first pass 8/11/16
library(vegan)
library(kernlab)
library(stringr)
library(DAAG)
library(beeswarm)
set.seed(311) # remember the 90s?

# to make rmarkdown work
if(isTRUE(getOption('knitr.in.progress'))) { 
  setwd('..')} else {
  setwd('~/Dropbox/Ath_STRs/')
  }

num_strs = 2050
pseudocount = .5
boot_divisor = 4 # controls size of bootstrapped samples for SVR selection inference

countnas = function(matrix) {
  nanum = c()
  for (i in rownames(matrix)) {
    nanum[i] = length(which(is.na(matrix[i,])))
  }
  return(nanum)
}

gc_content = function(str) {
  gc = str_count(str,'G') + str_count(str,'C')
  n=nchar(str)
  return(gc/n)
  }

genos = read.table('problem_mip_genotypes_081016.txt',header=T)
info = read.csv('mip_design_troubleshoot/mip_lib_info_081116.csv',header=T,stringsAsFactors = FALSE)
#annots = read.csv('~/Dropbox/Ath_STRs/araport_annots/str_annots.csv',header=T)
annotation = read.delim('araport_annot/Ath_STRs_full_annotations_052317.tsv',header=T,sep='\t')
rownames(annotation) = annotation$ID  
lib_names = gsub('_R1_001.fastq.gz','',info[1:96,'spikein_file'])
lib_names = gsub('-','.',lib_names)
info = info[1:96,]
rownames(info) = lib_names
info = info[colnames(genos),]
colnames(genos) = info$Strain
geography = info[,'Region.1001genomes.']
regions = unique(geography)
geog_colors = c('red','blue','black','dark green','purple','orange','hot pink')
names(geog_colors) = regions

# work around known crappiness
# probably not expansions because i checked a few of these and they are largely just garbage
genos[genos<0] = NA   # this happens when 'impossible' lengths are found- expansions?

all_genos = genos

# really basic analysis... how many calls?
count_calls = function(gen_tab) {
  return(length(which(!(is.na(genos)))))
  }

count_alleles = function(gen_tab,numcols,sum=FALSE) {
  tablature = apply(gen_tab[,1:numcols,drop=FALSE],1,table)
  lengthed = simplify2array(lapply(tablature,length))
  lengthed[lengthed==0] = NA # all genotypes missing
  if (sum) {
  return(sum(lengthed))
  } else {return(lengthed)}
  }

rarefy_count_alleles = function(gen_tab) { 
  gen_tab=gen_tab[,sample(colnames(gen_tab))]
  rared = sapply(1:96,count_alleles,gen_tab = gen_tab, sum=TRUE)
  return(rared)
  }

all_calls = count_calls(genos)

cat('num genotype calls total:',all_calls,'\n','proportion called:',all_calls/(num_strs*96),'\n')

# estimate proportion of calls overall by strain
strain_ascertain = 1 - (countnas(t(genos)) / num_strs)
hist(strain_ascertain,20,xlab='Proportion STR genotypes called per strain',main='')

# by str
nacount = countnas(genos)
hist(96-as.numeric(nacount),20,xlab='Number of genotype calls per STR (of 96)',main='')

# how many alleles per repeat?? 
lengthed = count_alleles(genos,96)
hist(lengthed,25,xlab='Number of alleles / STR', main='', col='black', right=FALSE)
cat('average number of alleles:',mean(lengthed, na.rm=TRUE),'median:',median(lengthed, na.rm=TRUE),'\n')
cat(length(which(is.na(lengthed))),'strs have no genotype calls in any strain','\n')

# by functional annotation:
par(mfrow=c(2,2))
hist(lengthed[annotation[names(lengthed),'annotation'] =='coding'], 25,
     main='Coding', xlab = 'Number of alleles / STR', xlim = c(0,26), col='black')
hist(lengthed[annotation[names(lengthed),'annotation'] =='UTR'], 25,
     main='UTR', xlab = 'Number of alleles / STR', xlim = c(0,26), col='black')
hist(lengthed[annotation[names(lengthed),'annotation'] =='intron'], 25,
     main='Intron', xlab = 'Number of alleles / STR', xlim = c(0,26), col='black')
hist(lengthed[annotation[names(lengthed),'annotation'] =='Intergenic'], 25,
     main='Intergenic', xlab = 'Number of alleles / STR', xlim = c(0,26), col='black')



# composition of very badly ascertained STRs
unascertained = annotation[names(which(nacount>90)),]
cat('Motif counts among STRs with low ascertainment (5 or fewer genotypes):\n')
motifed = sort(table(unascertained$Consensus))
print(motifed[motifed>0])

# # rarefaction of alleles??
# # takes a while- uncomment if desire to rerun
#  num_rars = 10
#  num_strains = ncol(genos)
#  rar_mat = matrix(rep(NA,num_rars*num_strains),num_rars)
#  plot(1,type='n',xlim=c(0,100),ylim=c(1000,sum(lengthed)), las=1, ylab='Number of alleles', xlab='Number of strains sampled')
# # 
#  for (i in 1:num_rars) {
#    cat('rarefaction subsample',i,'\n')
#    rar_mat[i,] = rarefy_count_alleles(genos)
#    lines(1:96,rar_mat[i,])
#    }

# plot distribution of most common allele frequency
major_allele_freq = function(x) {
  x = unlist(x)
  if ( length(which(is.na(x))) > 86 ) {return(NA)}
  commonest = sort(table(na.omit(x)),decreasing=TRUE)[1]
  freq = commonest / length(na.omit(x))
  return(freq)
  }

# look at allele freq spectrum (roughly... for lots of info see Haasl and Payseur 2010 MBE)
major_allele_freqs = apply(genos,1,major_allele_freq)
plot(density(na.omit(major_allele_freqs),bw=.01),xlab='Major STR allele frequency',main='',xlim=c(0,1))
lessthan50percent = length(which(na.omit(major_allele_freqs)<=.5))
monoallelic = length(which(major_allele_freqs==1))
total = length(na.omit(major_allele_freqs))
cat(lessthan50percent/total*100,'% of STRs have major allele frequencies <=50%\n',sep='')
cat(monoallelic/total*100,'% of STRs are monoallelic\n',sep='')

# Haasl and Payseur 2010 state that an estimate of theta (scaled mu = 4Ne*mu) is 
# theta = (1 / (8*mean(x)^2)) - 1/2 (Eq. 11), where mean(x) is the mean frequency of alleles at a given locus
# pretty straightforward i think? though the constants may change w/ n (pop sample size)... should email em.
# esp. because my sample size itself varies a bit!!!
theta_hat = function(x) {
  x = unlist(x)
  if ( length(which(is.na(x))) > 71 ) {return(NA)}
  allele_counts = sort(table(na.omit(x)),decreasing=TRUE)
  if (length(allele_counts) < 2) {return(NA)}
  freqs = allele_counts / length(na.omit(x))
  xbar2 = mean(freqs)^2
  thetahat = (1 / (8*xbar2)) - 1/2
  return(thetahat)
}
thetaed = apply(genos,1,theta_hat)

# plot the estimator... some really high values there!!
plot(density(na.omit(thetaed)),xlab='Theta_hat_xbar',main='')

# see if weirdness is introduced by sample size. a few values are below zero (!!!) but no obvious sample size reason for that
annots = annotation[rownames(genos),]
cored = cor.test(96-nacount, thetaed,na.rm=TRUE,xlim=c(0,40),xlab='# units in Col',ylab='Theta_hat')$estimate
plot(96-nacount,thetaed,pch='.',xlab='n',ylab='theta_hat_xbar',main=cored)

# look for correlations with other things that would predict var
# make sure that you've run the Ath_MIPs_FigS1_design.R script or version thereof to populate
# env with annots and other stuff!!
cored = cor.test(annots$X..of.Units, thetaed,na.rm=TRUE,xlim=c(0,40),xlab='# units in Col',ylab='Theta_hat')$estimate
plot(annots$X..of.Units, thetaed,xlim=c(5,40),xlab='# units in Col',ylab='Theta_hat',pch='.',main=cored)
cored = cor.test(annots$Unit.Size, thetaed,na.rm=TRUE,xlim=c(0,40),xlab='# units in Col',ylab='Theta_hat')$estimate
boxplot(thetaed~annots$Unit.Size,xlab='Unit size',ylab='Theta_hat',pch='.',main=cored)
cored = cor.test(annots$Purity, thetaed,na.rm=TRUE,xlim=c(0,40),xlab='# units in Col',ylab='Theta_hat')$estimate
boxplot(thetaed~annots$Purity,xlab='% Purity',ylab='Theta_hat',pch='.',main=cored)

par(mfrow=c(2,3))
high_theta = names(na.omit(thetaed[thetaed>60]))

for (thet in high_theta) {
  hist(as.numeric(genos[thet,]),30,xlab=paste('STR',thet,'unit number'),main='')
  }
par(mfrow=c(1,1))

# various ordination approaches
# nmds- from vegan package. scaling to avoid weirdness from highly var reps- necessary?
scale_dists = dist(scale(t(genos)))
dists = dist(t(genos))
nmds_scale = metaMDS(scale_dists)   # that looks really weird... no convergence # now fixed?
plot(nmds_scale,type='n')
text(nmds_scale,labels=rownames(nmds_scale$points))

nmds = metaMDS(dists)
plot(nmds,type='n')   # VERY DIFFERENT!! 
text(nmds,labels=rownames(nmds$points)) # different, better, but poor sep

pca = prcomp(t(na.omit(genos)))
plot(pca$x[,1],pca$x[,2],type='n')  # not bad, but not great
text(pca$x[,1],pca$x[,2],labels=colnames(genos))  

var_genos = t(na.omit(genos))
vars = apply(var_genos,2,var)
var_genos = var_genos[,vars>0]
scale_pca = prcomp(var_genos,scale=T)
cat('first PCA eigs:',scale_pca$sdev[1:10]^2/sum(scale_pca$sdev^2),'\n')
plot(scale_pca$x[,1],scale_pca$x[,2],type='n',xlab='PC1',ylab='PC2',main='PCA')  
text(scale_pca$x[,1],scale_pca$x[,2],labels=info[1:96,'Strain'],cex=.7,col = geog_colors[geography],srt=45)
legend(12,10,legend=regions,fill=geog_colors,cex=.7)

plot(scale_pca$x[,2],scale_pca$x[,3],type='n',xlab='PC2',ylab='PC3',main='PCA')  
text(scale_pca$x[,2],scale_pca$x[,3],labels=info[1:96,'Strain'],cex=.5,col = geog_colors[geography],srt=45)
legend(10,20,legend=regions,fill=geog_colors,cex=.7)

plot(scale_pca$x[,1],scale_pca$x[,2],cex=.7,col = geog_colors[geography],xlab='PC1',ylab='PC2',main='PCA',pch=19)  
#text(scale_pca$x[,1],scale_pca$x[,2],labels=colnames(genos),cex=.5)
legend(12,10,legend=regions,fill=geog_colors,cex=.7)
# PC3 shows cvi/mr-0 craziness well.



# ok! now selection analysis.
strs = read.csv('~/Dropbox/Ath_STRs/mip_design_troubleshoot/str_masterdata.csv',header=T,stringsAsFactors=FALSE)
strs = cbind(strs,gc_content(strs$Consensus))
colnames(strs)[colnames(strs)=="gc_content(strs$Consensus)"] = 'GC_content'
#strs = strs[strs$Keisha_moniker!='' & strs$Keisha_moniker!='1',]
#pat = '([A-Z]+_[0-9]+)_[A-Za-z_0-9 \\*]+'
rownames(strs) = strs$ID

# add MIP arm info
mips = read.csv('mip_design_troubleshoot/2100_mips_designed.csv',header=T,stringsAsFactors = FALSE)
# have to reparse these...
mip_ids = gsub('([0-9]+)_.+', '\\1', mips$mip_name)
rownames(mips) = mip_ids
ta_gc = rep(NA,nrow(strs))
la_gc = rep(NA,nrow(strs))
scan_gc = rep(NA,nrow(strs))
names(ta_gc) = rownames(strs)
names(la_gc) = rownames(strs)

# get GC contents
ta_gc[mip_ids] = gc_content(mips$ext_probe_sequence)
la_gc[mip_ids] = gc_content(mips$lig_probe_sequence)
scan_gc[mip_ids] = gc_content(mips$scan_target_sequence)

strs = cbind(strs,ta_gc = ta_gc[rownames(strs)],
             la_gc = la_gc[rownames(strs)],
             scan_gc = scan_gc[rownames(strs)])
       

# check against ref, report numbers
bound = na.omit(cbind(as.numeric(genos[as.character(strs$ID),'Col']),strs$X..of.Units))
accuracy = length(which(bound[,1]==bound[,2])) / nrow(bound)
cat('in Col, typed',length(na.omit(genos[,'Col'])),'of 2050 strs, among which', accuracy, 'accuracy\n')

# some measures of spread
genos[nacount>=70,] = rep(NA,96)

logged = na.omit(log10(apply(genos,1,sd,na.rm=TRUE)+pseudocount))
sd_strs = names(logged)
strs = strs[sd_strs,]

annotation = read.delim('~/Dropbox/Ath_STRs/araport_annot/Ath_STRs_full_annotations_052317.tsv',header=T)
rownames(annotation) = annotation$ID  
annotation = annotation[sd_strs,]
sd_genos = genos[sd_strs,]

novar = rep('variable',length(sd_strs))
novar[logged == log10(pseudocount)] = 'invariant'
novar = cbind(novar,as.character(annotation[names(logged),'annotation']))
print('distribution of invariant STRs across annotations')
print(table(as.data.frame(novar)))
novar[novar[,2]!='coding',2] = 'noncoding'
print('test association of coding with invariant STRs')
print(fisher.test(table(as.data.frame(novar))))

# split by annotation
nc = sd_genos[annotation$annotation == 'Intergenic',]
cod = sd_genos[annotation$annotation == 'coding',]
utr = sd_genos[annotation$annotation == 'UTR',]
intr = sd_genos[annotation$annotation == 'intron',]

print(table(annotation$annotation))

plot(strs[,'VARscore'],logged,ylab='logSD(str)',cex=.4)

require(infotheo)
entrop = apply(round(sd_genos[sd_strs,]),1,entropy)
varcor = cor(strs[,'VARscore'],entrop)
plot(strs[,'VARscore'],entrop,main=paste('VARscore r=',varcor),cex=.4,xlab='VARscore',ylab='STR entropy')

# distributions of responses
par(mfrow=c(2,2))
hist(logged,20,xlab='log(SD of STR variation)',main='')
hist(entrop,20,xlab='Entropy of STR variation',main='')
plot(logged,strs[,'VARscore'],xlab='log(SD of STR variation)',ylab='VARscore',pch='.',main=cor.test(logged,strs[,'VARscore'],na.rm=TRUE)$estimate)
plot(entrop,strs[,'VARscore'],xlab='Entropy of STR variation',ylab='VARscore',pch='.',main=cor.test(entrop,strs[,'VARscore'],na.rm=TRUE)$estimate)
par(mfrow=c(1,1))

# try a SVM?? (like legendre) (SVR (regression)- using epsilon regression,
# which is apparently better in situations where you're not too worried about 
# the complexity of the solution: 
# http://stats.stackexchange.com/questions/94118/difference-between-ep-svr-and-nu-svr-and-least-squares-svr)
# i have n>>p so i am ok i think

# mean squared error to evaluate model performance
MSE = function(predicted,actual) {
  sq_err = (predicted-actual)^2
  return(mean(sq_err))
  }

svr_fit = function(data,str_ids,response) {
#  fit=ksvm(response[str_ids]~.,as.matrix(data[str_ids,]),cross=5,scaled=FALSE)
  fit=ksvm(response[str_ids]~.,as.matrix(data[str_ids,]),scaled=FALSE)
  return(fit)
  }

svr_pred_eval = function(svr, newdata, str_ids,response) {
  pred = predict(svr,newdata=newdata[str_ids,])
  core = cor(response[str_ids], pred)
  mse = MSE(response[str_ids], pred)
  out = list(pred,core,mse)
  names(pred) = str_ids
  names(out)=c('predict','cor','mse')
  return(out)
  }

# restrict to numeric predictors, trying various... added multiple gc contents for mips too
#svm_strs=strs[,c("TAIR10start","Unit.Size","Purity","X..of.Units","GC_content","ta_gc","la_gc","scan_gc")]
#svm_strs=strs[,c("Unit.Size","Purity","X..of.Units","GC_content","ta_gc","la_gc","scan_gc")]
svm_strs=strs[,c("Unit.Size","Purity","X..of.Units","GC_content","scan_gc")]
# worked ok
#svm_strs=strs[,c("TAIR10start","Chromosome","Unit.Size","Purity","X..of.Units","GC_content")]
# also ok
#svm_strs=strs[,c("Chromosome","Unit.Size","Purity","X..of.Units","GC_content")]
# a bit narrower, overall somewhat better fit? (with SVR)
#svm_strs=strs[,c("Unit.Size","Purity","X..of.Units","GC_content")]

#svm_strs$TAIR10start = svm_strs$TAIR10start / chr_lens[strs$Chromosome]

rownames(svm_strs) = names(entrop)
# replace reference copy number with median of observed if present
for (i in names(entrop)) {
  names(svm_strs$X..of.Units) = names(entrop)
  med = median(as.numeric(genos[i,]),na.rm=TRUE)
  if (is.na(med)) { continue }
  else {
  svm_strs[i,'X..of.Units'] = med
  }
}

# first do all data to see how it is
tryk = ksvm(logged~.,as.matrix(svm_strs),cross=5,scaled=FALSE)
svm_pred = predict(tryk,svm_strs)
names(svm_pred) = rownames(svm_strs)
svm_cor = cor(svm_pred,logged)
svm_mse = MSE(svm_pred,logged)
ncs = rownames(nc)

# plot from this everything model
# informative?
# plot(svm_pred[rownames(nc)],logged[rownames(nc)],main=paste('SVM r=',svm_cor,'MSE =',svm_mse),cex=.4,pch=19) # better than varscore
# points(svm_pred[rownames(cod)],logged[rownames(cod)], cex=.4, pch=19, col='red')
# points(svm_pred[rownames(intr)],logged[rownames(intr)], cex=.4, pch=19, col='green')
# points(svm_pred[rownames(utr)],logged[rownames(utr)], cex=.4, pch=19, col='blue')
# abline(0,1)

nc_mse = MSE(svm_pred[rownames(nc)],logged[rownames(nc)])
cod_mse = MSE(svm_pred[rownames(cod)],logged[rownames(cod)])
intr_mse = MSE(svm_pred[rownames(intr)],logged[rownames(intr)])
utr_mse = MSE(svm_pred[rownames(utr)],logged[rownames(utr)])

print(cor.test(svm_pred[ncs],logged[ncs]))
print(cor.test(svm_pred[rownames(utr)],logged[rownames(utr)]))
print(cor.test(svm_pred[rownames(intr)],logged[rownames(intr)]))
print(cor.test(svm_pred[rownames(cod)],logged[rownames(cod)]))

# entropy better?
entrok = ksvm(entrop~.,as.matrix(svm_strs),cross=5,scaled=FALSE)
entro_pred = predict(entrok,svm_strs)
names(entro_pred) = names(entrop)
entro_cor = cor(entro_pred,entrop)
entro_mse = MSE(entro_pred,entrop)
plot(entro_pred,entrop,xlab='SVR prediction',ylab='entropy',main=paste('SVM r=',entro_cor,'MSE =',entro_mse),cex=.4) # better than log-SD?

# now try on nc data
ncs = rownames(nc)
nck = ksvm(logged[rownames(nc)]~.,as.matrix(svm_strs[rownames(nc),]),cross=5,scaled=FALSE)
nck_entro = ksvm(entrop[rownames(nc)]~.,as.matrix(svm_strs[rownames(nc),]),cross=5,scaled=FALSE)
nc_svm_pred = predict(nck,svm_strs[rownames(nc),])
nc_svm_pred_entro = predict(nck_entro,svm_strs[rownames(nc),])

nc_svm_cor = cor(nc_svm_pred,logged[rownames(nc)])
nc_svm_entro_cor = cor(nc_svm_pred_entro,entrop[rownames(nc)])
nc_svm_mse = cor(nc_svm_pred,logged[rownames(nc)])
nc_svm_entro_mse = MSE(nc_svm_pred_entro,entrop[rownames(nc)])

plot(nc_svm_pred,logged[rownames(nc)],main=paste('NC SVM r=',nc_svm_cor),cex=.4,xlab='NC SVR') # better than log-SD?
plot(nc_svm_pred_entro,entrop[rownames(nc)],main=paste('NC SVM r=',nc_svm_entro_cor),cex=.4,xlab='NC SVR') # better than log-SD?

# moving forward with entropy, somewhat better justification...
cod_entro_ncpred = predict(nck_entro,svm_strs[rownames(cod),])

cod_entro_svm = ksvm(entrop[rownames(cod)]~., as.matrix(svm_strs[rownames(cod),]),cross=5,scaled=FALSE)
cod_entro_codpred = predict(cod_entro_svm,svm_strs[rownames(cod),])
cod_entro_codpred_cor = cor(cod_entro_codpred,entrop[rownames(cod)])
plot(cod_entro_codpred,entrop[rownames(cod)],cex=.4,main=cod_entro_codpred_cor)

nc_entro_codpred = predict(nck_entro,svm_strs[rownames(nc),])
plot(nc_entro_codpred,entrop[rownames(nc)],cex=.4,main=cor(nc_entro_codpred,entrop[rownames(nc)]))

nc_entro_allpred = predict(nck_entro,svm_strs)
nc_log_allpred = predict(nck,svm_strs)
names(nc_log_allpred) = rownames(svm_strs)
plot(nc_entro_allpred,entrop,cex=.4,main=cor(nc_entro_allpred,entrop),xlab='NC str-trained SVR, 5-fold CV',ylab='Entropy')
plot(nc_log_allpred,logged,cex=.4,main=cor(nc_log_allpred,logged),xlab='NC str-trained SVR, 5-fold CV',ylab='Log(SD)')

# cv.lm() from DAAG package
# compare to SVR fit just to see
entrop = logged
lm.data = cbind(entrop[rownames(nc)],svm_strs[rownames(nc),])
all_lm.data = cbind(entrop,svm_strs)
colnames(lm.data)[1] = 'entropic'
colnames(all_lm.data)[1] = 'entropic'
rownames(all_lm.data) = rownames(svm_strs)
rownames(lm.data) = rownames(nc)
fitlm = lm(entropic ~ ., data=lm.data)
fitalllm = lm(entropic ~., data=all_lm.data)
# commented because they make big irritating plots
#lmallcv= cv.lm(data=all_lm.data,form.lm=fitlm,m=5)
# not printing or plotting breaks fn? weird. should be fixed.
# plotting is less disruptive with fitlm (one ugly plot rather than pages and pages of printout)
lmcv = cv.lm(data=all_lm.data,form.lm=fitlm,m=5,plot=TRUE, printit=FALSE) #, printit=FALSE) 
#lmcv = cv.lm(data=all_lm.data,form.lm=fitalllm,m=5,plot=TRUE, printit=FALSE) #, printit=FALSE) 

# plot the results, PCCs
par(mfrow=c(1,2))
plot(nc_log_allpred[ncs],logged[ncs],cex=.4,ylim=c(-.3,1),pch=19,col='black', xlab='NC str-trained SVR prediction, 5-fold CV',ylab='Log(SD)')
points(nc_log_allpred[rownames(utr)],logged[rownames(utr)],cex=.4,col='blue',pch=19)
points(nc_log_allpred[rownames(intr)],logged[rownames(intr)],cex=.4,col='green',pch=19)
points(nc_log_allpred[rownames(cod)],logged[rownames(cod)],cex=.4,col='red',pch=19)
abline(0,1)

plot(lmcv[ncs,'cvpred'],logged[ncs],ylim=c(-.3,1),pch=19,xlab='NC str-trained LM prediction, 5-fold CV',ylab='Log(SD)', cex=.4)
points(lmcv[rownames(cod),'cvpred'],logged[rownames(cod)],cex=.4,col='red',pch=19)
points(lmcv[rownames(utr),'cvpred'],logged[rownames(utr)],cex=.4,col='blue',pch=19)
points(lmcv[rownames(intr),'cvpred'],logged[rownames(intr)],cex=.4,col='green',pch=19)
abline(0,1)

print('PCCs for CV SVR mod by genomic location')
print(cor.test(nc_log_allpred[ncs],logged[ncs]))
print(cor.test(nc_log_allpred[rownames(utr)],logged[rownames(utr)]))
print(cor.test(nc_log_allpred[rownames(intr)],logged[rownames(intr)]))
print(cor.test(nc_log_allpred[rownames(cod)],logged[rownames(cod)]))

# lm pccs
print('PCCs for CV LM mod by genomic location')
print(cor.test(lmcv[ncs,'cvpred'],logged[ncs]))
print(cor.test(lmcv[rownames(cod),'cvpred'],logged[rownames(cod)]))
print(cor.test(lmcv[rownames(intr),'cvpred'],logged[rownames(intr)]))
print(cor.test(lmcv[rownames(utr),'cvpred'],logged[rownames(utr)]))
par(mfrow=c(1,1))

# evaluate training vs test - loop through and see what distributions are for nc strs
mses = c()
cors = c()
nc_predic_mat = c()
cod_mses = c()
cod_predic_mat = c()
utr_predic_mat = c()
intr_predic_mat = c()
train_size = round(nrow(nc) / boot_divisor)

####### THIS IS THE SELECTION ANALYSIS THAT I ACTUALLY USED
# I KNOW THAT IT LOOKS TERRIBLE DON'T JUDGE ME
# in any case, USING LOG(SD) from now on. because i coded it as entropy everywhere, just doing this jankety thing:
entrop = logged

for (i in 1:1000) {
if (i%%64 == 0) {print(i)}
rand_strs = sample(rownames(nc),replace=TRUE)
train = rand_strs[1:train_size]
test = rand_strs[(train_size+1):length(rand_strs)]
fit = svr_fit(data = svm_strs,response = entrop,str_ids = train)
predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=ncs,response=entrop)
mses[i] = predic$mse
cors[i] = predic$cor
rownames(predic$predict) = ncs  # apparently this is a matrix...
nc_predic_mat = cbind(nc_predic_mat, predic$predict[rownames(nc),])
# coding predictions -  should have just wrapped in a fn
predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(cod),response=entrop)
cod_mses[i] = predic$mse
rownames(predic$predict) = rownames(cod)  # apparently this is a matrix...
cod_predic_mat = cbind(cod_predic_mat,predic$predict[rownames(cod),])
# utr predictions
predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(utr),response=entrop)
rownames(predic$predict) = rownames(utr)
utr_predic_mat = cbind(utr_predic_mat,predic$predict[rownames(utr),])
utr_mse[i] = predic$mse
# intron predictions
predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(intr),response=entrop)
rownames(predic$predict) = rownames(intr)
intr_predic_mat = cbind(intr_predic_mat,predic$predict[rownames(intr),])
intr_mse[i] = predic$mse
}

# diagnostics on the predictions...
boxplot.matrix(t(nc_predic_mat[1:30,]),las=2)
points(1:30,entrop[rownames(nc)][1:30],pch=19,col='red',cex=.5)

# for nc
ncmeaned = rowMeans(nc_predic_mat)
ncsded = apply(nc_predic_mat,1,sd)
nczscores = (entrop[rownames(nc)]-ncmeaned) / ncsded
hist(nczscores,20)

codmeaned = rowMeans(cod_predic_mat)
codsded = apply(cod_predic_mat,1,sd)
codzscores = (entrop[rownames(cod)]-codmeaned) / codsded
hist(codzscores,20)

utrmeaned = rowMeans(utr_predic_mat)
utrsded = apply(utr_predic_mat,1,sd)
utrzscores = (entrop[rownames(utr)]-utrmeaned) / utrsded
hist(utrzscores,20)

intrmeaned = rowMeans(intr_predic_mat)
intrsded = apply(intr_predic_mat,1,sd)
intrzscores = (entrop[rownames(intr)]-intrmeaned) / intrsded
hist(intrzscores,20)

allzs = append(nczscores, codzscores)
allzs = append(allzs, intrzscores)
allzs = append(allzs, utrzscores)
zs_w_preds = cbind(allzs, svm_strs[names(allzs),], annotation[names(allzs),]$annotation,logged[names(allzs)])
colnames(zs_w_preds)[1] = 'Z-score'
colnames(zs_w_preds)[4] = 'N Units'
colnames(zs_w_preds)[5] = 'Unit GC'
#colnames(zs_w_preds)[10] = 'Log(SD)'
colnames(zs_w_preds)[8] = 'Log(SD)'
plot(zs_w_preds[,-9],pch='.',cex.main=.6)

# nothing has anything to do with DHSs
par(mfrow=c(1,2))
logged = logged[as.character(annotation$ID)]

annotation$DHS[annotation$DHS>1] = 1

beeswarm(
  zs_w_preds$`Z-score` ~ annotation[rownames(zs_w_preds),'DHS'] * annotation[rownames(zs_w_preds),'annotation'],
  pch=19, ylab='Constraint score', las=2, cex = .3, xlab=''
  )

#TEs?
beeswarm(
  zs_w_preds$`Z-score` ~ annotation[rownames(zs_w_preds),'transposon'] * annotation[rownames(zs_w_preds),'annotation'],
  pch=19, ylab='Constraint score', las=2, cex = .25, main='1=associated with TE, 0=not associated with TE', xlab=''
) 

par(mfrow=c(1,1))

# arbitrary thresholds for stabilizing/diversifying selection
div_thresh = quantile(nczscores,.975)
stab_thresh = quantile(nczscores,.025)

codselneg = cbind(codzscores[codzscores<(stab_thresh)],strs[names(codzscores[codzscores<(stab_thresh)]),])
codselpos = cbind(codzscores[codzscores>div_thresh],strs[names(codzscores[codzscores>div_thresh]),])

var_strs = cbind(strs[names(allzs),], allzs)
tabled_var_strs = table(var_strs$Consensus)
var_strs = var_strs[var_strs$Consensus %in% names(which(tabled_var_strs>4)),]
boxplot(var_strs$allzs~var_strs$Consensus, las=2, ylab='Z-scores')

# coding still enriched?
background = table(annotation[rownames(zs_w_preds),'annotation']) / nrow(strs)
stab_sel = cbind(zs_w_preds[zs_w_preds$`Z-score`<stab_thresh,] , annotation[rownames(zs_w_preds[zs_w_preds$`Z-score`<stab_thresh ,]),])
div_sel = cbind(zs_w_preds[zs_w_preds$`Z-score`>div_thresh,] , annotation[rownames(zs_w_preds[zs_w_preds$`Z-score`>div_thresh ,]),])
sels = rbind(table(stab_sel$annotation),table(div_sel$annotation))
expected = rbind(
  background*nrow(stab_sel),
  background*nrow(div_sel)
)

par(mfrow=c(1,2))
barplot(sels,col=c('white','black'),beside=T,ylim=c(0,max(sels)+5),ylab='# STRs under selection')
points(c(1.5,4.5,7.5,10.5),expected[1,],bg='white',col='black',pch=22)
points(c(2.5,5.5,8.5,11.5),expected[2,],bg='white',col='black',pch=22)
legend(6,60,legend=c('Stabilizing','Diversifying'),fill=c('white','black'))
par(mfrow=c(1,1))

# main text fig comparing str categories
plot(density(nczscores),xlab='Z-score',main='bagged SVR prediction using intergenic STRs only',xlim=c(min(allzs),max(allzs)),col='black')
lines(density(codzscores),col='red')
lines(density(utrzscores),col='blue')
lines(density(intrzscores),col='green')
abline(v=div_thresh) #cutoff
abline(v=stab_thresh) #cutoff
legend(max(allzs)*.5,.17,legend=c('coding','noncoding','UTR','Intron'),fill=c('red','black','blue','green'))

# make color plot for bagged predictions
plot(ncmeaned,entrop[rownames(nc)],main=cor(ncmeaned,entrop[rownames(nc)]), 
     pch=19, xlim=c(-.2,.6), ylim=c(-.3, 1.0),cex=.6, xlab = 'Bagged SVR prediction', ylab='Log(SD)')
points(codmeaned,entrop[rownames(cod)],main=cor(codmeaned,entrop[rownames(cod)]), pch=19, col='red',cex=.6)
points(utrmeaned,entrop[rownames(utr)],main=cor(utrmeaned,entrop[rownames(utr)]), pch=19, col='blue',cex=.6)
points(intrmeaned,entrop[rownames(intr)],main=cor(intrmeaned,entrop[rownames(intr)]), pch=19, col='green',cex=.6)

# PCCs for bagged predictions
print('PCCs for CV SVR mod by genomic location')
print(cor.test(ncmeaned,entrop[rownames(nc)]))
print(cor.test(utrmeaned,logged[rownames(utr)]))
print(cor.test(intrmeaned,logged[rownames(intr)]))
print(cor.test(codmeaned,logged[rownames(cod)]))

# are deviations systematic??? not really...?
#plot(ncmeaned,nczscores,main=cor(nczscores,ncmeaned),xlim=c(-.5,.5),ylim=c(-12,12),cex=.2)
plot(ncmeaned,nczscores,main=cor(nczscores,ncmeaned),xlim=c(-.2,.7),
     ylim=c(min(allzs),max(allzs)),cex=.5, pch=19, xlab='Predicted Log(SD)',ylab='Z-score')
points(intrmeaned,intrzscores,main=cor(intrzscores,intrmeaned),col='green',xlim=c(-.5,.5), pch=19, ylim=c(-10,10),cex=.5)
points(codmeaned,codzscores,main=cor(codzscores,codmeaned),col='red',xlim=c(-.5,.5), pch=19, ylim=c(-10,10),cex=.5)
points(utrmeaned,utrzscores,main=cor(utrzscores,utrmeaned),col='blue',xlim=c(-.5,.5), pch=19, ylim=c(-10,10),cex=.5)

#predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids = nc_test,response = entrop)

# pull out datasets representing each class of str/selxn regime
# these are all written to spreadsheets
nc_div_sel = cbind(zs_w_preds[zs_w_preds$`Z-score`>div_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='Intergenic',], annotation[rownames(zs_w_preds[zs_w_preds$`Z-score`>div_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='Intergenic',]),])
utr_div_sel = cbind(zs_w_preds[zs_w_preds$`Z-score`>div_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='UTR',],annotation[rownames(zs_w_preds[zs_w_preds$`Z-score`>div_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='UTR',]),])
intr_div_sel = cbind(zs_w_preds[zs_w_preds$`Z-score`>div_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='intron',], annotation[rownames(zs_w_preds[zs_w_preds$`Z-score`>div_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='intron',]),])
intr_stab_sel = cbind(zs_w_preds[zs_w_preds$`Z-score`<stab_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='intron',], annotation[rownames(zs_w_preds[zs_w_preds$`Z-score`<stab_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='intron',]),])
utr_stab_sel = cbind(zs_w_preds[zs_w_preds$`Z-score`<stab_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='UTR',], annotation[rownames(zs_w_preds[zs_w_preds$`Z-score`<stab_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='UTR',]),])
nc_stab_sel = cbind(zs_w_preds[zs_w_preds$`Z-score`<stab_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='Intergenic',], annotation[rownames(zs_w_preds[zs_w_preds$`Z-score`<stab_thresh & zs_w_preds$`annotation[names(allzs), ]$annotation`=='Intergenic',]),])

# give all identical consensus units for a consensus unit by frameshifting
enum_frames = function(string) {
  id = c(string)
  for (i in 1:(str_length(string)-1)) {
    string = paste(substr(string,2,str_length(string)), substr(string,1,1), sep='')
    id = append(id,string)
  }
  return(sort(id))
}

# take vector of units, count while accounting for degeneracy/RC
summarize_dedup_consensus = function(unit_vec) {
  require(seqinr)
  counts = c()
  maps = c()
  for (unit in unit_vec) {
    if (!(unit %in% names(maps))) {
      rt_unit = str_to_upper(paste(rev(comp(str_split(unit,'')[[1]])),collapse=''))
      degens = sort(append( enum_frames(unit), enum_frames(rt_unit) ))
      for ( degen in degens ) {
        maps[degen] = degens[1]
      } 
      counts[maps[degen]] = 1
    } else {counts[maps[unit]] = counts[maps[unit]] + 1}
  }
  return(counts[sort(names(counts))])
}

# named vector x, names bkgrd, add 0 for missing names in x
expand_w_zeros = function(x,bkgrd) {
  x[bkgrd[!(bkgrd %in% names(x))]] = 0
  return(x)
}

# specifically for motif abundances
plot_deduped_consensus_freqs = function(div,stab,bkgrd,thresh=1,main) {
  bkgrd = summarize_dedup_consensus(bkgrd$Consensus)
  profs= rbind(
    expand_w_zeros(summarize_dedup_consensus(stab$Consensus),names(bkgrd))[names(bkgrd)],
    expand_w_zeros(summarize_dedup_consensus(div$Consensus),names(bkgrd))[names(bkgrd)]
  )
  to_plot = profs[,colSums(profs)>thresh]
  x_coords = seq(from = 1.5, to = 3*ncol(to_plot)+.5, by=3) # add one for beside col
  expected = sum(profs[1,]) * bkgrd[colnames(to_plot)] / sum(bkgrd)
  expected = rbind(expected, sum(profs[2,]) * bkgrd[colnames(to_plot)] / sum(bkgrd))
  barplot(to_plot, beside=TRUE, las=2, col=c('white','black'), ylim=c(0,max(expected)+2), ylab='Number of STRs',main=main)
  points(append(x_coords,x_coords+1), append(expected[1,],expected[2,]),pch=22,bg='white',col='black',cex=.6)
}

# plot out unit consensus distributions for all classes selected STRs
# white is diversifying, black is stabilizing
par(mfrow=c(2,2))
plot_deduped_consensus_freqs(codselpos,codselneg,annotation[annotation$annotation=='coding',],thresh=0,'Coding')
plot_deduped_consensus_freqs(intr_div_sel,intr_stab_sel,annotation[annotation$annotation=='intron',],thresh=0,'Introns')
plot_deduped_consensus_freqs(utr_div_sel,utr_stab_sel,annotation[annotation$annotation=='UTR',],thresh=0,'UTRs')
plot_deduped_consensus_freqs(nc_div_sel,nc_stab_sel,annotation[annotation$annotation=='Intergenic',],thresh=0,'Intergenic')
par(mfrow=c(1,1))

# find str expansions (relative)
# takes str allele distribution as input
find_expansions = function(matrix,str_id) {
  distr = as.numeric(matrix[str_id,])
  if (length(distr)==0) {return(NA)}
  med = median(na.omit(distr))
  meded = (distr-med) / med
  return(meded)
}

medea = all_genos
medea[!(is.na(medea))] = NA
for (str in rownames(all_genos)) {
  medea[str,] = find_expansions(all_genos,str )
}
max_med = apply(medea,1,max,na.rm=TRUE)

max_in_strain = apply(medea,2,max,na.rm=TRUE)
expanded = max_in_strain[max_in_strain>=2]
cat('num strains with >=1 expanded STR (>=2X median copy number):',length(expanded),'of',length(max_in_strain),'strains, involving',length(which(max_med>=2)),'STRs.\n')

hist(max_med,40,xlab='Expansion score\n(max unit number / median unit number)',main='')
expanded = na.omit(medea[medea>=2])  #totally arbitrary

# ntm1 intronic str was interesting (expansion score ~ 6):
hist(as.numeric(genos['65400',]),20,main='',xlab='NTM1 intron STR copy number',xlim=c(0,20))
hist(as.numeric(genos['47883',]),10,main='',xlab='MEE36 intron STR copy number',col='black')

# repopulate this, to get back filtered-out strs
annotation = read.delim('araport_annot/Ath_STRs_full_annotations_052317.tsv',header=T)
rownames(annotation) = annotation$ID  

print(table(annotation[rownames(genos),'annotation']))

# chars of the expanded strs
expansion_strs = cbind(annotation[names(max_med[max_med>=2]),], max_med[names(which(max_med>=2))])
print(table(expansion_strs$annotation))

# write out a table of expanded things
write.table(expansion_strs, 'output/expanded_strs_table_s2_010917.txt',sep='\t',quote=FALSE)
table(expansion_strs$annotation)

# avg missing genotypes for expanded STRs 
exp_missing = sum(nacount[rownames(expansion_strs)])/(nrow(expansion_strs) * 96)

# avg missing genotypes for non-expanded STRs with at least two calls
# (possible to have a non-zero expansion score)
non_exp = names(nacount)[
  !(names(nacount) %in% rownames(expansion_strs))
  & nacount < 95 ]
non_exp_missing = sum(nacount[non_exp])/ (length(non_exp) * 96)

cat('expanded STRs have',exp_missing,'proportion missing data compared to', 
    non_exp_missing, 'for comparable STRs\n')

print('difference in missing data:')
print(wilcox.test(nacount[as.character(expansion_strs$ID)],nacount[!(names(nacount) %in% as.character(expansion_strs$ID)) & nacount<95]))

# bin4 intron STR analysis
tas = annotation[rownames(genos),]
tas = rownames(tas[tas$Consensus %in% c('AT','TA') & tas$Purity==100,])
ta_meds = apply(genos[tas,],1,median,na.rm=TRUE)
tas = rownames(genos[tas,][ta_meds >=7 & ta_meds <= 12,])
ta_to_comp = unlist(genos[tas,] - apply(genos[tas,],1,median,na.rm=TRUE))

plot(density(ta_to_comp,na.rm=TRUE), ylim=c(0,.8), lwd=2, 
     main='BIN4 intron STR compared to other TAs', xlab='Difference from median copy number')
lines(density(unlist(genos['87547',])-median(unlist(genos['87547',]),na.rm=TRUE), na.rm=TRUE),col='red', lwd=2)
legend(8,.75, legend = c('BIN4 STR','TA/AT STRs'), fill=c('red','black'),cex=.75) #, bty=n) # knitr doesn't like the bty parameter?

###
# CALLING EXTERNAL SCRIPTS THAT MAKE FIGURES ETC.
###

source('code/AthMips_figs1_design_010217.R')
source('code/qpcr_analysis_65400_47883_2bioreps_100216.R')
source('code/str_pheno_assoc_post_analysis_032017.R')
source('code/effectsize_plot_032417.R')
source('code/cmt2_analysis.R')

########
# compare the SVR predictions to a case where we train on everything
# potentially controls for compositional differences between STR classes,
# but loses the nice property of removing selection. 
########

inter_allzs = allzs # stash this object for comparison

# the following duplicates code above- but never packaged well in a fn, so will have to do.
for (i in 1:1000) {
  if (i%%64 == 0) {print(i)}
  rand_strs = sample(rownames(sd_genos),replace=TRUE)
  train = rand_strs[1:train_size]
  test = rand_strs[(train_size+1):length(rand_strs)]
  fit = svr_fit(data = svm_strs,response = entrop,str_ids = train)
  predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=ncs,response=entrop)
  mses[i] = predic$mse
  cors[i] = predic$cor
  rownames(predic$predict) = ncs  # apparently this is a matrix...
  nc_predic_mat = cbind(nc_predic_mat, predic$predict[rownames(nc),])
  # coding predictions -  should have just wrapped in a fn
  predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(cod),response=entrop)
  cod_mses[i] = predic$mse
  rownames(predic$predict) = rownames(cod)  # apparently this is a matrix...
  cod_predic_mat = cbind(cod_predic_mat,predic$predict[rownames(cod),])
  # utr predictions
  predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(utr),response=entrop)
  rownames(predic$predict) = rownames(utr)
  utr_predic_mat = cbind(utr_predic_mat,predic$predict[rownames(utr),])
  utr_mse[i] = predic$mse
  # intron predictions
  predic = svr_pred_eval(svr=fit,newdata=svm_strs,str_ids=rownames(intr),response=entrop)
  rownames(predic$predict) = rownames(intr)
  intr_predic_mat = cbind(intr_predic_mat,predic$predict[rownames(intr),])
  intr_mse[i] = predic$mse
}

ncmeaned = rowMeans(nc_predic_mat)
ncsded = apply(nc_predic_mat,1,sd)
nczscores = (entrop[rownames(nc)]-ncmeaned) / ncsded
hist(nczscores,20)

codmeaned = rowMeans(cod_predic_mat)
codsded = apply(cod_predic_mat,1,sd)
codzscores = (entrop[rownames(cod)]-codmeaned) / codsded
hist(codzscores,20)

utrmeaned = rowMeans(utr_predic_mat)
utrsded = apply(utr_predic_mat,1,sd)
utrzscores = (entrop[rownames(utr)]-utrmeaned) / utrsded
hist(utrzscores,20)

intrmeaned = rowMeans(intr_predic_mat)
intrsded = apply(intr_predic_mat,1,sd)
intrzscores = (entrop[rownames(intr)]-intrmeaned) / intrsded
hist(intrzscores,20)

allzs = append(nczscores, codzscores)
allzs = append(allzs, intrzscores)
allzs = append(allzs, utrzscores)

two_mods_cor = cor(allzs, inter_allzs)

# looks very similar. probably no big effect of different training sets.
par(mfrow = c(1,2))
plot(allzs[ncs], inter_allzs[ncs],pch=19,cex=.25,xlim=c(-7,15),ylim=c(-7,15), 
     xlab='Bagged SVR using all STRs', ylab='Bagged SVR using intergenic STRs only',
     main=paste('PCC =',round(two_mods_cor,2)) )
points(allzs[rownames(intr)], inter_allzs[rownames(intr)],pch=19,cex=.25,col='green')
points(allzs[rownames(utr)], inter_allzs[rownames(utr)],pch=19,cex=.25,col='blue')
points(allzs[rownames(cod)], inter_allzs[rownames(cod)],pch=19,cex=.25,col='red')

# re-infer selection thresholds
div_thresh = quantile(nczscores,.975)
stab_thresh = quantile(nczscores,.025)

plot(density(nczscores),xlab='Z-score',main='bagged SVR prediction using all STRs',xlim=c(min(allzs),max(allzs)),col='black')
lines(density(codzscores),col='red')
lines(density(utrzscores),col='blue')
lines(density(intrzscores),col='green')
abline(v=div_thresh) #cutoff
abline(v=stab_thresh) #cutoff
legend(max(allzs)*.5,.17,legend=c('coding','noncoding','UTR','Intron'),fill=c('red','black','blue','green'))
par(mfrow=c(1,1))

# rm these data objects to avoid confusion going forward
rm(allzs, nczscores, codzscores, utrzscores, intrzscores, intr_predic_mat, cod_predic_mat, nc_predic_mat, utr_predic_mat)
