# make a figure describing designed STR composition
# annotations of function (exon/DHS/whatever) do not match up!!!!
# (between genotyped, strs, picked strs and full set)
# last updated 1/2/2017
# refactored 3/25/17

# uncomment this and dev.off() below to make PDFs of figures
#pdf('athmip_suppfig1_112116.pdf')

# Plotting function so that every summary panel is drawn the same way.
#
# Draws grouped bars per category (white = all STRs, gray = targeted,
# black = typed) and writes the typed/targeted fraction next to each
# "typed" bar.
#
# Arguments:
#   all     counts per category for the full STR set
#   mip     counts per category for the MIP-targeted STRs
#   hit     counts per category for the successfully typed STRs
#   xlabel  x-axis label
#   log.y   not used by this function; it has a default so that it no
#           longer looks like a required argument
#
# rbind() lines its inputs up by position, not by name. When all three
# inputs are named but their names differ (e.g. a category is absent from
# the typed subset), the inputs are therefore matched by category name
# first: categories keep the order of `all`, any extra categories seen only
# in `mip`/`hit` are appended, and categories missing from a set count as 0.
# Inputs whose names already agree, or that are unnamed, are used as given.
#
# Returns the per-category fraction hit / mip, rounded to two decimals.
plot_str_summs <- function(all, mip, hit, xlabel = '', log.y = FALSE) {
  # re-index one count vector onto a common set of category names
  align_to <- function(counts, categories) {
    aligned <- as.vector(counts)[match(categories, names(counts))]
    aligned[is.na(aligned)] <- 0
    names(aligned) <- categories
    aligned
  }

  set_names <- list(names(all), names(mip), names(hit))
  all_named <- !any(vapply(set_names, is.null, logical(1)))
  same_names <- identical(set_names[[1]], set_names[[2]]) &&
    identical(set_names[[1]], set_names[[3]])
  if (all_named && !same_names) {
    categories <- unique(unlist(set_names))
    categories <- categories[!is.na(categories)]
    all <- align_to(all, categories)
    mip <- align_to(mip, categories)
    hit <- align_to(hit, categories)
  }

  fracs <- round(hit / mip, 2)
  to_plot <- rbind(all, mip, hit)
  plotted <- barplot(to_plot, beside = TRUE,
                     col = c('white', 'gray', 'black'), # ylim = c(0, 2500),
                     las = 2, ylab = 'Number of STRs', xlab = xlabel)

  # add fraction of targeted STRs that were typed, next to the "typed" bars
  # (row 3 of the bar midpoints); fractions that are not finite (nothing
  # targeted in that category) are left blank instead of printing NaN/Inf
  frac_labels <- as.character(as.vector(fracs))
  frac_labels[!is.finite(as.vector(fracs))] <- ''
  text(x = .25 + plotted[3, seq_along(hit)], y = (hit + 100),
       labels = frac_labels, cex = .7, srt = 90)

  legend(length(to_plot) / 1.3, max(to_plot, na.rm = TRUE) - 100,
         legend = c('All', 'Targeted', 'Typed'),
         fill = c('white', 'gray', 'black'), cex = .8)
  fracs
}

# MIPs that were picked for the design (tab-separated)
picked <- read.csv('mip_design_troubleshoot/2100_mips_picked_021716.txt',
                   header = TRUE, sep = '\t')

# replaced allstrs with a corrected araport11 annot thing
allstrs <- read.delim('araport_annot/Ath_STRs_full_annotations_031317.tsv', sep = '\t')

# index the annotation table by STR ID so it can be looked up by row name
rownames(allstrs) <- as.character(allstrs$ID)
pilot_calls <- read.csv('mip_design_troubleshoot/benchmarked_results.txt',
                        header = TRUE, sep = ' ')

# column 1: MIPSTR call, column 2: annotated number of units for the same STR
calling <- cbind(pilot_calls[, 'open2_extend2_mismatch1150'],
                 allstrs[rownames(pilot_calls), "X..of.Units"])
# negative values are treated as missing, then incomplete pairs are dropped
calling[calling < 0] <- NA
rownames(calling) <- rownames(pilot_calls)
calling <- na.omit(calling)
accuracy <- length(which(calling[, 1] == calling[, 2])) / nrow(calling)
pilot_cor <- cor(calling[, 1], calling[, 2])

#pdf('MIPSTR_vs_true_calls_120216.pdf')
# the title is built from the values computed above rather than typed in by
# hand, so it cannot drift out of sync with the data
plot(calling, xlim = c(0, 50), ylim = c(0, 50), cex = .2, pch = 19,
     xlab = 'MIPSTR allele length calls', ylab = 'True allele length',
     main = paste0('Accuracy = ', round(100 * accuracy), '%, r = ',
                   round(pilot_cor, 2)))
# #dev.off()

# IDs of the pilot loci that produced a usable MIPSTR call.
# NOTE(review): negative values are excluded here as well as NA, because the
# same column is treated as missing when negative both when `calling` is
# built and when the benchmark table is re-read further down; previously
# such loci were still counted as typed in this step. Confirm that negative
# values are indeed "no call" codes.
pilot_call_values <- pilot_calls[, 'open2_extend2_mismatch1150']
pilot_calls <- rownames(pilot_calls)[!is.na(pilot_call_values) &
                                       pilot_call_values >= 0]

# (the coverage/quality table is not read here: nothing uses it before the
# coverage section, which reads the file itself)

# drop everything from the first underscore on, then key the MIP table by
# what remains so it can be matched against allstrs$ID
stripped_ids <- gsub('_[0-9A-Za-z_]+', '', picked$mip_name)
rownames(picked) <- stripped_ids
annots <- allstrs[allstrs$ID %in% stripped_ids, ]
pilot <- allstrs[allstrs$ID %in% pilot_calls, ]

# restrict all strs to just those with level of purity used in targeted exons
allstrs <- allstrs[allstrs$Purity >= 89, ]

# counts per annotation class for the full, targeted and typed STR sets
alltable <- table(allstrs$annotation)
miptable <- table(annots$annotation)
piltable <- table(pilot$annotation)

# first panel of a 2 x 3 grid of summary barplots
par(mfrow = c(2, 3))
plot_str_summs(alltable, miptable, piltable, xlabel = '')

# panel: counts by purity for the full, targeted and typed sets
alltable <- table(allstrs$Purity)
miptable <- table(annots$Purity)
piltable <- table(pilot$Purity)

plot_str_summs(alltable, miptable, piltable, xlabel = 'Purity (%)')

# panel: counts by repeat unit size for the same three sets
alltable <- table(allstrs$Unit.Size)
miptable <- table(annots$Unit.Size)
piltable <- table(pilot$Unit.Size)

plot_str_summs(alltable, miptable, piltable, xlabel = 'Unit Size')

# panel: locus sizes binned into 20-wide bins from 20 to 180; each bin is
# labelled by its midpoint. hist() is used only for the binning (plot = FALSE).
histogram <- hist(annots$Size.of.Locus, breaks = seq(20, 180, by = 20), plot = FALSE)
miptable <- setNames(histogram$counts, histogram$mids)

histogram <- hist(allstrs$Size.of.Locus, breaks = seq(20, 180, by = 20), plot = FALSE)
alltable <- setNames(histogram$counts, histogram$mids)

histogram <- hist(pilot$Size.of.Locus, breaks = seq(20, 180, by = 20), plot = FALSE)
piltable <- setNames(histogram$counts, histogram$mids)

plot_str_summs(alltable, miptable, piltable, xlabel = 'Locus size')

# counts per repeat-unit consensus for the targeted, full and typed sets
mipunits <- table(annots$Consensus)
allunits <- table(allstrs$Consensus)
pilunits <- table(pilot$Consensus)

# the (up to) ten most frequent units in the full set; head() rather than
# [1:10] so that fewer than ten distinct units does not produce NA names
top_units <- head(names(allunits)[order(allunits, decreasing = TRUE)], 10)
alltable <- allunits[top_units]

# a top unit that never occurs in the targeted/typed set is looked up by a
# name that does not exist, which yields NA with an NA name; count it as 0
# and restore the unit name so the three sets stay aligned
miptable <- mipunits[top_units]
miptable[is.na(miptable)] <- 0
names(miptable) <- top_units
piltable <- pilunits[top_units]
piltable[is.na(piltable)] <- 0
names(piltable) <- top_units

plot_str_summs(alltable, miptable, piltable, xlabel = 'Unit consensus')

cat('overall genotyping success rate:', sum(pilunits) / sum(mipunits), '\n')

# AT and TA units pooled; summing over the names that are present means a
# unit absent from a set contributes 0 instead of turning the rate into NA
at_like <- c('AT', 'TA')
pil_at <- sum(pilunits[names(pilunits) %in% at_like])
mip_at <- sum(mipunits[names(mipunits) %in% at_like])
cat('success rate split into AT/TA vs. others:\n', 'AT/TA: ', pil_at / mip_at,
    '\nall others: ', (sum(pilunits) - pil_at) / (sum(mipunits) - mip_at),
    '\n', sep = '')



# now look at why certain MIPs failed, whether that was predictable from mipgen logistic score
# this file is slightly edited output from script
# 0217_2016_5strains/code/tabulateMaxqualCoveragePerMip.sh
covqu <- read.table('mip_design_troubleshoot/Col_cov_qual_081816.txt', header = TRUE)

# start from the coverage/quality table and append, per STR, the logistic
# score of the MIP that was picked for it
coved_log <- cbind(covqu, picked[as.character(covqu$STR), 'logistic_score'])

# benchmark calls; negative values are treated as missing
bench <- read.csv('mip_design_troubleshoot/benchmarked_results.txt', header = TRUE, sep = ' ')
bench[bench < 0] <- NA
num_cols <- ncol(bench)

# STRs with coverage data but no row in the benchmark table get an all-NA row
missing <- as.character(covqu$STR)
missing <- missing[!(missing %in% rownames(bench))]
bench[missing, ] <- rep(NA, num_cols)

# column 5: the MIPSTR call for each STR (NA when there is none)
coved_log <- cbind(coved_log, bench[as.character(covqu$STR), 'open2_extend2_mismatch1150'])
#coved_log = cbind(coved_log,!is.na(bench[,'open2_extend2_mismatch1150']))

# column 6: text flag saying whether column 5 holds a call
coved_log <- cbind(coved_log,
                   ifelse(is.na(coved_log[, 5]), 'No call', 'Call'),
                   stringsAsFactors = FALSE)

# column 7: unit consensus looked up in allstrs by STR ID
coved_log <- cbind(coved_log,
                   as.character(as.vector(allstrs[as.character(covqu$STR), 'Consensus'])))

colnames(coved_log) <- c('STR', 'Colqual', 'Colcov', 'MIP logistic score',
                         'MIPSTR call', 'Called', 'Consensus')

# now plot a bunch of stats on MIPs and sequencing (2 x 2 grid)
par(mfrow = c(2, 2))

# distribution of coverage, drawn as a line on a log-scaled x axis
h <- hist(covqu[, 'Colcov'], breaks = 5000, plot = FALSE)
plot(h$mids, h$counts, log = 'x', xlab = 'Coverage', ylab = '# STRs', type = 'l')

# number of STRs with coverage below 50
sum(covqu[, 'Colcov'] < 50, na.rm = TRUE)

# maximum quality vs. coverage; the correlation uses log(coverage + 1)
covqual_cor <- cor(coved_log[, 'Colqual'], log(coved_log[, 'Colcov'] + 1))
plot(coved_log[, 'Colqual'], (1 + coved_log[, 'Colcov']), log = 'y', pch = '.',
     xlab = 'Maximum quality', ylab = 'Coverage',
     main = paste0('r = ', round(covqual_cor, 2)))
abline(v = 150)

# also report correlation coefficients for relationships- both of these are p < 2.2e-16.
# MIP logistic score vs. coverage
cov_cor <- cor(coved_log[, 'MIP logistic score'], log(coved_log[, 'Colcov'] + 1))
plot(coved_log[, 'MIP logistic score'], (coved_log[, 'Colcov'] + 1), log = 'y', pch = '.',
     xlab = 'MIP logistic score', ylab = 'Coverage',
     main = paste0('r = ', round(cov_cor, 2)))

# MIP logistic score vs. maximum quality
qual_cor <- cor(coved_log[, 'MIP logistic score'], coved_log[, 'Colqual'])
plot(coved_log[, 'MIP logistic score'], coved_log[, 'Colqual'], pch = '.',
     xlab = 'MIP logistic score', ylab = 'Maximum quality',
     main = paste0('r = ', round(qual_cor, 2)))
abline(h = 150)

# quality, coverage and logistic score split by call status; box widths are
# scaled by the number of STRs in each group
par(mfrow = c(2, 3))
calls <- table(coved_log[, 'Called'])
boxplot(coved_log[,'Colqual'] ~ coved_log[,'Called'], width = calls, ylab = 'Maximum quality')
boxplot((1+coved_log[,'Colcov']) ~ coved_log[,'Called'], width = calls, log = 'y', ylab = 'Coverage')
boxplot(coved_log[,'MIP logistic score'] ~ coved_log[,'Called'], width = calls, ylab = 'MIP logistic score')

# now look just a little at the AT stuff
# Collapse the consensus into 'AT' (AT or TA) vs. 'Other'. The recode is done
# on a plain character vector so it behaves the same whether the column is a
# factor or character, and so that STRs with a missing consensus stay NA
# (they are then simply left out of the boxplots) instead of stopping the
# data-frame assignment with an error.
consensus_group <- as.character(coved_log[, 'Consensus'])
consensus_group[consensus_group %in% 'TA'] <- 'AT'
consensus_group[!is.na(consensus_group) & consensus_group != 'AT'] <- 'Other'
coved_log[, 'Consensus'] <- consensus_group
#par(mfrow=c(1,2))
# first pass without plotting, only to get the group sizes used as box widths
boxed <- boxplot(coved_log[,'MIP logistic score'] ~ coved_log[,'Called'] * factor(coved_log[,'Consensus'],levels=c('AT','Other')), plot = FALSE)
boxplot((1+coved_log[,'Colcov']) ~ coved_log[,'Called'] * factor(coved_log[,'Consensus'],levels=c('AT','Other')), width = boxed$n, log = 'y', ylab = 'Coverage', las = 2)
boxplot(coved_log[,'MIP logistic score'] ~ coved_log[,'Called'] * factor(coved_log[,'Consensus'],levels=c('AT','Other')), width = boxed$n, ylab = 'MIP logistic score', las = 2)
boxplot(coved_log[,'Colqual'] ~ coved_log[,'Called'] * factor(coved_log[,'Consensus'],levels=c('AT','Other')), width = boxed$n, ylab = 'Maximum quality', las = 2)

#dev.off()

# back to a single panel per figure
par(mfrow = c(1, 1))

# compare MIPSTR calls with dideoxy calls for the validated loci
dideoxy <- read.csv('validations/str_validation_dideoxy_data.csv', header = TRUE)
# keep only loci where both calls are present
nona_dideoxy <- na.omit(cbind(dideoxy$mipstr_allele, dideoxy$dideoxy_allele))
# named dideoxy_cor rather than cor so that the result does not mask the
# cor() function for the rest of the session
dideoxy_cor <- cor(nona_dideoxy)[1, 2]
accuracy <- length(which(nona_dideoxy[, 1] == nona_dideoxy[, 2])) / nrow(nona_dideoxy)

#pdf('mipstr_vs_dideoxy_data_122916.pdf')
plot(dideoxy$mipstr_allele, dideoxy$dideoxy_allele,
     pch = 19, xlab = 'MIPSTR call', ylab = 'Dideoxy call',
     main = paste('Accuracy:', round(accuracy, 2), '\nPCC:', round(dideoxy_cor, 3))
     )
#dev.off()

# technical reproducibility
# calls from the full experiment (column max.1_S1) for the loci kept in
# `calling`, paired with the pilot calls; incomplete pairs are dropped
bigexpt <- read.table('problem_mip_genotypes_081016.txt', header = TRUE)[rownames(calling), 'max.1_S1']
twocalls <- na.omit(cbind(calling[, 1], bigexpt))

print('evaluating technical reproducibility between pilot and big expts for Col')
repro_cor <- cor.test(twocalls[,1], twocalls[,2])
print(repro_cor)
same <- length(which(twocalls[, 1] == twocalls[, 2]))
total <- nrow(twocalls)
# trailing newline so that later output does not run on from this line
cat('number calls the same for loci called in both:', same, 'out of', total,
    'total, for accuracy:', same / total, '\n')
#pdf('tech_repro_col_010217.pdf')
plot(twocalls, xlab = 'Pilot Col call', ylab = 'Full experiment Col call', cex = .5, pch = 19,
     main = paste('Accuracy:', round(same / total, 3), '\nPCC:', round(repro_cor$estimate, 3)))
#dev.off()

