# Four blues, ordered dark to light; one colour per plotted sample series.
cap_color_gradient <- c('#0d304c', '#1967a5', '#2a90e1', '#56a7e8')

data_directory <- '../data_tables/'

# Load the 5P feature BED file of every sample and split its rows into
# capped and noncapped features, keyed by sample name.
allowed_distance <- 500
decorator <- 'W'
sample_list <- c('fb', '1ng_fb', '100pg_fb', '10pg_fb')

capped_features <- list()
noncapped_features <- list()
genes_with_cap <- list()
genes_with_terminal_noncap <- list()
genes_with_internal_noncap <- list()

# Codes in BED column 9 that are retained. Noncapped features are further
# split into the 'terminal' codes (U, S, P) and the 'internal' codes (D, I).
kept_classes <- c('S', 'P', 'U', 'D', 'I')
terminal_classes <- c('U', 'S', 'P')
internal_classes <- c('D', 'I')

for (s in sample_list) {
  # Each sample has its own file. The path was previously hard-coded to
  # 'fb.W.5P.bed', which made every sample load identical features.
  # NOTE(review): assumes files are named <sample>.<decorator>.5P.bed, as
  # 'fb.W.5P.bed' is for the 'fb' sample -- confirm for the other samples.
  bed_path <- paste0(data_directory, s, '.', decorator, '.5P.bed')
  features <- read.table(bed_path, stringsAsFactors = FALSE)

  # Column 4 holds the feature name, which contains '.capped.' or
  # '.noncapped.'; column 7 holds the gene ID; column 9 the class code.
  capped <- features[grep('\\.capped\\.', features[, 4]), ]
  noncapped <- features[grep('\\.noncapped\\.', features[, 4]), ]

  capped <- capped[capped[, 9] %in% kept_classes, ]
  noncapped <- noncapped[noncapped[, 9] %in% kept_classes, ]

  capped_features[[s]] <- capped
  noncapped_features[[s]] <- noncapped

  genes_with_cap[[s]] <- unique(capped[, 7])
  genes_with_terminal_noncap[[s]] <- unique(
    noncapped[noncapped[, 9] %in% terminal_classes, 7]
  )
  genes_with_internal_noncap[[s]] <- unique(
    noncapped[noncapped[, 9] %in% internal_classes, 7]
  )
}

# Load TAIR10 gene types for each AGI (row names are the AGIs, the first
# remaining column is the gene type).
gene_types <- read.table(
  paste0(data_directory, 'TAIR10_gene_types.txt'),
  row.names = 1,
  stringsAsFactors = FALSE
)
# which() drops rows whose type is NA instead of yielding NA gene IDs.
pcgenes <- rownames(gene_types)[which(gene_types[, 1] == 'protein_coding')]

# Smart-seq2 expression table; only the three 'fb' replicate columns are used.
master_genes <- read.table(
  paste0(data_directory, 'smartseq2_TPM.tsv'),
  stringsAsFactors = FALSE
)
tpm_table <- master_genes[, c('fb_1', 'fb_2', 'fb_3')]

# Mask chloroplast and mitochondrial genes, which are not expected to
# have capped features. Select only protein-coding genes to remove
# ncRNAs (snoRNA, rRNA, tRNA) that will also not have capped 5P ends.
# NOTE(review): anno_in_range() is defined outside this section; it is
# presumably returning the gene IDs annotated within the given interval
# on chromosome 2 -- confirm.
mito_chloro_agis <- c(
  grep('AT[CM]G', rownames(tpm_table), value = TRUE),
  anno_in_range('2', 3230000, 3510000)
)
tpm_table <- tpm_table[!(rownames(tpm_table) %in% mito_chloro_agis), ]
# Apply the protein-coding selection described above; pcgenes was computed
# but previously never used to filter the table.
tpm_table <- tpm_table[rownames(tpm_table) %in% pcgenes, ]

# Expression cutoffs: 0.1, then 2/5/10 scaled by 0.1, 1 and 10.
thresholds <- c(.1, c(c(2, 5, 10) %o% 10^(-1:1)))

# x position under which the "mean TPM is exactly zero" group is stored.
null_thresh <- 0.02

# Mean TPM per gene across the replicate columns, computed once.
mean_tpm <- rowMeans(tpm_table)

# Gene sets to evaluate, named by cutoff: first the genes with a mean TPM
# of zero (stored under null_thresh), then the genes at or above each
# threshold.
gene_sets <- c(
  list(names(which(mean_tpm == 0))),
  lapply(thresholds, function(cutoff) names(which(mean_tpm >= cutoff)))
)
names(gene_sets) <- as.character(c(null_thresh, thresholds))

# For every sample, the fraction of each gene set found in that sample's
# entry of `reference_by_sample`. Returns a list (by sample) of lists
# (by cutoff) of single proportions.
fraction_by_cutoff <- function(reference_by_sample) {
  per_sample <- lapply(sample_list, function(s) {
    lapply(gene_sets, function(genes) mean(genes %in% reference_by_sample[[s]]))
  })
  names(per_sample) <- sample_list
  per_sample
}

tvals <- fraction_by_cutoff(genes_with_cap)
nct_tvals <- fraction_by_cutoff(genes_with_terminal_noncap)
nci_tvals <- fraction_by_cutoff(genes_with_internal_noncap)

# Draw, for every sample, the fraction of genes with a capped feature
# (tvals) against the TPM cutoff on a log-scaled x axis, and write the
# figure to a PDF in the working directory.
pdf(
  'FIG3C.cap_feature_sensitivity.pdf',
  width = 3,
  height = 2,
  pointsize = 8,
  useDingbats = FALSE
)
par(mar = c(4, 4, 0, 0), lend = 'square', ljoin = 'mitre')

# The first sample sets up the plot region; axes are drawn manually below.
plot(
  x = as.numeric(names(tvals[[sample_list[1]]])),
  y = as.numeric(unlist(lapply(tvals[[sample_list[1]]], mean))),
  log = 'x',
  axes = FALSE,
  type = 'b',
  pch = 20,
  xlab = 'Smart-seq2 transcripts per million (TPM)',
  ylab = 'Percent with capped feature',
  ylim = c(0, 1),
  xlim = c(null_thresh, 100),
  col = cap_color_gradient[1]
)

# Remaining samples are overlaid. seq_along(...)[-1] is empty when there is
# only one sample, whereas 2:length(...) would iterate over c(2, 1).
for (i in seq_along(sample_list)[-1]) {
  points(
    x = as.numeric(names(tvals[[sample_list[i]]])),
    y = as.numeric(unlist(lapply(tvals[[sample_list[i]]], mean))),
    type = 'b',
    pch = 20,
    col = cap_color_gradient[i]
  )
}

# The 'n.d.' tick sits at null_thresh, the x position of the zero-TPM
# baseline point, so the label lines up with the point it describes.
axis(
  1,
  c(null_thresh, .1, .2, .5, 1, 2, 5, 10, 20, 50, 100),
  labels = c('n.d.', '.1', '.2', '.5', '1', '2', '5', '10', '20', '50', '100')
)
# Proportions on the y axis are labelled as percentages.
axis(2, c(0, .25, .5, .75, 1), c(0, 25, 50, 75, 100), las = 1)
abline(v = c(.1, 1, 10, 100), lty = 2)
abline(h = c(0, 1))
legend(
  'topleft',
  fill = cap_color_gradient[1:length(sample_list)],
  border = NA,
  bty = 'n',
  legend = c('5ng', '1ng', '100pg', '10pg')
)

dev.off()


# Genes reaching at least 20 TPM in two or more replicate columns; print
# those that are not in the capped gene set of the 'fb' sample.
replicates_at_20 <- rowSums(tpm_table >= 20)
highly_expressed <- names(which(replicates_at_20 >= 2))
has_cap <- highly_expressed %in% genes_with_cap[["fb"]]
print(highly_expressed[!has_cap])

