# Folder holding the generated data files. File names are appended to it by
# plain string concatenation further down, so the trailing slash matters.
sourcefolder <- "L:/members/Schon/Writing/nanoPARE/data_generated/"

# TAIR10 annotation: keep columns 1, 4, 5, 7 and 9 of the 'protein' rows
# (presumably seqid, start, end, strand and attributes -- confirm against the
# file), then reduce column 5 to the bare identifier that follows
# gene_id= / ID= / transcript_id=.
# NOTE(review): the cds_anno built here is overwritten by the Araport11 table
# below, so only the Araport11 version is in effect when the script is run
# top to bottom.
anno_gff <- read.table(
  "L:/members/Schon/lab_files/Annotations/TAIR10/TAIR10_GFF3_genes_nod.gff",
  stringsAsFactors = FALSE
)
cds_anno <- anno_gff[anno_gff[, 3] == "protein", c(1, 4, 5, 7, 9)]
cds_anno[, 5] <- gsub(
  "^(gene_id|ID|transcript_id)=(.+?)[;\\.].*$",
  "\\2",
  cds_anno[, 5]
)

# Araport11 annotation: keep columns 1, 4, 5, 7 and 13 of the 'CDS' rows.
# Column 13 is used as the gene identifier by compare_to_orf().
anno_gff <- read.table(
  "L:/members/Schon/lab_files/Annotations/Araport11/Araport11_nod.gtf",
  stringsAsFactors = FALSE
)
cds_anno <- anno_gff[anno_gff[, 3] == "CDS", c(1, 4, 5, 7, 13)]

compare_to_orf = function(pos, gene_id, cds = cds_anno){
  # Classifies a position relative to the ORF span of a gene.
  #
  # Arguments:
  #   pos     - numeric position to classify.
  #   gene_id - gene identifier; '.' means "no gene".
  #   cds     - table of CDS segments with start in column 2, end in column 3,
  #             strand in column 4 and gene identifier in column 5.
  #             Defaults to the script-level cds_anno table.
  #
  # Returns one of:
  #   '.'    - gene_id is '.'
  #   'NC'   - gene_id has no rows in cds (no protein product)
  #   'UTR5' - pos lies before the ORF in the direction of transcription
  #   'CDS'  - pos lies within [min(start), max(end)], both ends inclusive
  #   'UTR3' - pos lies after the ORF in the direction of transcription
  #
  # Only the outermost ORF coordinates are compared, so a position between
  # two CDS segments of the same gene is still reported as 'CDS'.
  #
  # Stops with an error if the gene's CDS rows are not all on one strand.
  #
  # NOTE(review): the caller passes a BED start plus an offset; confirm that
  # this is in the same coordinate system as the annotation.
  if(gene_id == '.'){
    return('.')
  }
  # %in% never yields NA, so rows with a missing identifier are not selected
  subtable = cds[cds[,5] %in% gene_id, , drop = FALSE]
  if(nrow(subtable) == 0){
    # AGI has no protein product
    return("NC")
  }
  if(length(unique(subtable[,4])) > 1) stop("ERROR: same AGI has opposite stranded ORFs")
  orf_start = min(subtable[,2])
  orf_end = max(subtable[,3])
  # Any strand other than '+' is treated as the minus strand
  on_minus = subtable[1,4] != '+'
  # Both ORF ends are inclusive: a position equal to orf_start is inside the
  # ORF, exactly like a position equal to orf_end.
  if(pos < orf_start){
    return(if(on_minus) 'UTR3' else 'UTR5')
  }
  if(pos > orf_end){
    return(if(on_minus) 'UTR5' else 'UTR3')
  }
  'CDS'
}

# Fill colour for every pie-chart category, keyed by category name.
piecolors <- c(
  # feature classes
  "UPSTREAM"   = "#5DB4E5",
  "PROXIMAL"   = "#1C75BC",
  "DISTAL"     = "#808285",
  "INTRON"     = "#58BD81",
  "ANTISENSE"  = "#F38FBB",
  # position relative to the ORF
  "5_UTR"      = "#8DC63F",
  "CDS"        = "#7C6EB2",
  "3_UTR"      = "#BE1E2D",
  "NONCODING"  = "#A97C50",
  # shared by both charts
  "INTERGENIC" = "white"
)

# Sample prefix of the feature file to load.
s <- "fb"
# Not referenced anywhere else in the visible part of this script.
decorator <- "W"

# Feature table for the sample; the values of column 4 become the row names.
bed <- read.table(
  paste0(sourcefolder, s, ".all_features.bed"),
  stringsAsFactors = FALSE
)
rownames(bed) <- bed[, 4]

# For each feature, classify the coordinate (column 2 + column 5) against the
# ORF of the gene named in column 7. apply() hands every row over as a
# character vector, hence the as.numeric() conversions. The result is appended
# to the table as a column named 'decorators'.
decorators <- apply(
  bed,
  1,
  function(feature) {
    position <- as.numeric(feature[2]) + as.numeric(feature[5])
    compare_to_orf(position, feature[7])
  }
)
bed <- cbind(bed, decorators)

# Split the features by the literal tag embedded in their row names.
capped <- bed[grepl(".capped.", rownames(bed), fixed = TRUE), ]
noncapped <- bed[grepl(".noncapped.", rownames(bed), fixed = TRUE), ]

# Maps the feature-class codes found in column 9 of the feature table to the
# category names used in the pie chart. 'A' and 'IA' share one category.
abbreviations <- c(
  'P'='PROXIMAL',
  'D'='DISTAL',
  'U'='UPSTREAM',
  'I'='INTRON',
  'A'='ANTISENSE',
  'IA'='ANTISENSE',
  'N'='INTERGENIC'
)

# Categories to draw, in slice order.
picked <- c('INTERGENIC','ANTISENSE','INTRON','DISTAL','PROXIMAL','UPSTREAM')

# Translate the codes to category names BEFORE counting. Counting first and
# renaming afterwards leaves two elements called 'ANTISENSE' ('A' and 'IA'),
# and selecting by name then keeps only the first one, silently dropping the
# other. Tabulating a factor with levels = picked merges both codes into a
# single count, gives 0 (rather than NA, which pie() rejects) for a category
# that is absent from the data, and fixes the slice order. Codes without an
# entry in abbreviations map to NA and are not counted.
tcap <- table(factor(
  unname(abbreviations[as.character(capped[,9])]),
  levels = picked
))

tnoncap <- table(factor(
  unname(abbreviations[as.character(noncapped[,9])]),
  levels = picked
))

# Two pies side by side (capped | noncapped), written to the working directory.
pdf('feature_pies.pdf',useDingbats = FALSE)
par(mfrow=c(1,2),mar=c(1,1,1,1))
pie(tcap,col = piecolors[picked],
    main='Capped features',init.angle = 180,border = NA)
pie(tnoncap,col = piecolors[picked],
    main='Noncapped features',init.angle = 180,border = NA)
dev.off()

# Maps the values returned by compare_to_orf() (column 10 of the feature
# table) to the category names used in the pie chart.
cds_abbreviations <- c(
  'UTR5'="5_UTR",
  'CDS'="CDS",
  'UTR3'="3_UTR",
  'NC'="NONCODING",
  '.'="INTERGENIC"
)

# picked = c('5_UTR','CDS','3_UTR','NONCODING','INTERGENIC')
# Categories to draw, in slice order.
picked <- c('INTERGENIC','NONCODING','3_UTR','CDS','5_UTR')

# Translate to category names first, then count against the fixed set of
# levels. A category that does not occur in the data is counted as 0; selecting
# a missing name from a plain table would instead give NA, which pie() rejects.
# as.character() matters because column 10 may be a factor (cbind() converts
# strings to factors on R < 4.0), and indexing a named vector with a factor
# would use its integer codes rather than its labels.
tcap <- table(factor(
  unname(cds_abbreviations[as.character(capped[,10])]),
  levels = picked
))

tnoncap <- table(factor(
  unname(cds_abbreviations[as.character(noncapped[,10])]),
  levels = picked
))

# Two pies side by side (capped | noncapped), written to the working directory.
pdf('feature_cds_pies.pdf',useDingbats = FALSE)
par(mfrow=c(1,2),mar=c(1,1,1,1))
pie(tcap,col = piecolors[picked],
    main='Capped features',init.angle = 180,border=NA)
pie(tnoncap,col = piecolors[picked],
    main='Noncapped features',init.angle = 180,border=NA)
dev.off()


# bed[,9] = paste(bed[,9],bed[,10],sep='_')
# gsub('N_\\.','N',bed[,9])
# write.table(bed[,1:9],'L:/members/Schon/Writing/nanoPARE/manuscript_draft/manuscript_data/fb.all_features.bed',quote=F,sep='\t',row.names = F,col.names = F)


