#!/usr/bin/Rscript
library(stringr)

# Seed the output table from the earlier STR summary file (from KC), keeping
# only the descriptive columns that carry over; rows are keyed by STR id.
orig <- read.csv("plantMicroSats-1KC-5-24.csv", header = TRUE, stringsAsFactors = FALSE)
new <- orig[, c(
    "ID", "Unit.Size", "X..of.Units", "Purity", "TRF.score", "VARscore",
    "Consensus", "Size.of.Locus", "annotation", "sanna.s.annotations"
)]
rownames(new) <- as.character(new$ID)

# Per-STR count of overlapping GFF entries, re-ordered to follow the rows of
# `new`. A count of 0 means nothing overlaps, so the STR is called intergenic.
count <- read.table(
    "araport_annot/overlap_STR_araport11_genes_transposons_countsonly_111516_restrict50.gff",
    sep = "\t",
    stringsAsFactors = FALSE
)
colnames(count) <- c("chr", "method", "what", "start", "stop", "ID", "nothing", "strand", "count")
rownames(count) <- count$ID
count <- count[as.character(new$ID), ]
new$annotation[count$count == 0] <- "Intergenic"

# Coordinates come from the count table; transposon / gene / all_annotation
# start out as placeholders that are filled in per STR later on.
chr <- count$chr
start <- count$start
stop <- count$stop
transposon <- rep(0, nrow(new))
gene <- rep(NA, nrow(new))
all_annotation <- rep(NA, nrow(new))
new <- cbind(new, chr, start, stop, transposon, gene, all_annotation)

#print('reading')
# Read the table listing the actual GFF entries overlapping each STR.
# Judging by the column names, the first eight fields describe the STR itself
# and the remaining ones the overlapping feature (presumably bedtools-style
# intersect output -- confirm against how the file was generated).
# `allowEscapes` is spelled out in full: the previous `allowEscape` only
# worked through partial argument matching.
gff <- read.table(
    'araport_annot/overlap_STR_araport11_genes_transposons_111516_restrict50.gff',
    sep = '\t',
    stringsAsFactors = FALSE,
    allowEscapes = TRUE
)
# NOTE: 'nothing' is used for two columns (7 and 14); gff$nothing resolves to
# the first of them.
colnames(gff) <- c('chr','method','what','start','stop','ID','nothing','strand','chr2','how','annot','start2','stop2','nothing','strand2','frame','info')

print('now at analysis')
# function for extracting annotations from slightly confusing GFF multi-entry situation,
# given str id, also getting gene info if there

# with sub-function for extracting a gene ID from last field string of GFF
#id_pat = 'ID=(AT[1-5]G[0-9]{5})'
# Return the first ';'-separated field of a GFF attribute string
# (e.g. 'ID=AT1G01010;Note=x' -> 'ID=AT1G01010').
#
# Arguments:
#   str:    attribute string; only its first element is used.
#   id_pat: unused. Kept so existing calls that pass it still work; it now
#           defaults to NULL because callers in this file never supply it.
#
# Returns a length-one character vector, or NA_character_ when `str` is
# empty or NA (for example when no matching feature row was found).
get_gene <- function(str, id_pat = NULL) {
    # indexing [[1]] on an empty split result would be an error
    if (length(str) == 0) {
        return(NA_character_)
    }
    # as.character() lets a logical NA through; ';' is matched literally
    first_field <- strsplit(as.character(str), ';', fixed = TRUE)[[1]][1]
    as.character(first_field)
}
    
# Annotate one STR in the summary table from the GFF entries overlapping it.
#
# Arguments:
#   gff: data frame of STR/feature overlaps. Columns read here: ID, annot
#        (feature type), start2 / stop2 (feature coordinates) and info
#        (attribute string handed to get_gene).
#   id:  STR identifier; coerced to character, used to select rows of `gff`
#        and as the row name written to in `new`.
#   new: per-STR summary table with STR ids as row names. Columns written:
#        all_annotation, annotation, transposon, gene.
#
# Returns `new` with the row for `id` updated. Decision order:
#   1. any pseudogene overlap          -> 'Intergenic' (gene left as is)
#   2. transposable_element_gene, or
#      no 'gene' feature at all        -> 'Intergenic', transposon = 1
#   3. gene plus transposable_element  -> transposon = 1, and 'Intergenic'
#      if the TE is large compared with the gene
#   4. otherwise 'UTR' / 'coding' / 'intron' from the sub-feature types
#
# NOTE: if `id` is not already a row name of `new`, the assignments below
# append a new row for it.
extract_annots <- function(gff, id, new) {
    id <- as.character(id)
    # which() keeps entries with an NA ID from being pulled in as all-NA rows
    annots <- gff[which(gff$ID == id), , drop = FALSE]
    feats <- annots$annot

    new[id, 'all_annotation'] <- paste(feats, collapse = ',')

    # Shared outcome for TE-dominated loci: intergenic, flagged as transposon,
    # gene parsed from the first overlapping entry whatever its type.
    as_te_intergenic <- function(tab) {
        tab[id, 'annotation'] <- 'Intergenic'
        tab[id, 'transposon'] <- 1
        tab[id, 'gene'] <- get_gene(annots[1, 'info'][1])
        tab
    }

    # Attribute string of the first overlapping entry of one type (NA if none)
    first_info <- function(feature) {
        annots[which(feats == feature), 'info'][1]
    }

    # Largest stop2 - start2 span among the entries of one type. Computed per
    # row: the earlier unlist()-based version subtracted two *start*
    # coordinates whenever more than one entry of the type overlapped.
    largest_span <- function(feature) {
        rows <- annots[which(feats == feature), , drop = FALSE]
        max(as.numeric(rows$stop2) - as.numeric(rows$start2))
    }

    # handle pseudogenes - just toss them
    if ('pseudogene' %in% feats) {
        new[id, 'annotation'] <- 'Intergenic'
        return(new)
    }

    # handle TEs - which may or may not be interesting depending on context.
    # NOTE(review): the "no gene feature" case also sets transposon = 1 even
    # when no TE feature is present; confirm that is intended.
    if ('transposable_element_gene' %in% feats || !('gene' %in% feats)) {
        return(as_te_intergenic(new))
    }

    if ('transposable_element' %in% feats) {
        new[id, 'transposon'] <- 1
        te_size <- largest_span('transposable_element')
        gene_size <- largest_span('gene')
        # heuristic: the TE only dominates the locus when it is both longer
        # than 200 and longer than half the gene
        if (te_size > 200 && te_size > (gene_size / 2)) {
            return(as_te_intergenic(new))
        }
    }

    # handle UTRs / coding sequence / introns
    utr_types <- c('five_prime_UTR', 'three_prime_UTR', 'transcript_region')
    if (any(utr_types %in% feats)) {
        new[id, 'annotation'] <- 'UTR'
        # gene ID is parsed from the exon entry
        new[id, 'gene'] <- get_gene(first_info('exon'))
    } else if ('CDS' %in% feats) {
        new[id, 'annotation'] <- 'coding'
        new[id, 'gene'] <- get_gene(first_info('CDS'))
    } else if ('protein' %in% feats) {
        new[id, 'annotation'] <- 'intron'
        new[id, 'gene'] <- get_gene(first_info('gene'))
    } else {
        # not officially UTRs in the same way, but apparently transcribed
        new[id, 'annotation'] <- 'UTR'
        new[id, 'gene'] <- get_gene(first_info('gene'))
    }
    new
}

# read in DHS data, use to annotate DHS (regulatory) sites
# The file is given the same column names as `count`, with the ninth column
# renamed to DHS.
dhs <- read.table('araport_annot/overlap_dhs_031317_restrict50.txt', sep = '\t', stringsAsFactors = FALSE)
colnames(dhs) <- colnames(count)
colnames(dhs)[9] <- 'DHS'

# Line the DHS values up with the rows of `new` by STR id rather than by row
# position, the same way `count` is aligned; a plain cbind() would silently
# mis-assign values if the file order differs from `new`.
# Assumes dhs$ID uses the same identifiers as new$ID -- the warning below
# fires if that does not hold.
dhs_row <- match(as.character(new$ID), as.character(dhs$ID))
if (anyNA(dhs_row)) {
    warning(sum(is.na(dhs_row)), ' STR id(s) not found in the DHS overlap file; their DHS value is NA',
            call. = FALSE)
}
DHS <- dhs$DHS[dhs_row]
new <- cbind(new, DHS)

# Fold every distinct STR id found in the overlap table through
# extract_annots, carrying the summary table along as the accumulator.
str_ids <- as.character(unique(gff$ID))
new <- Reduce(
    function(tab, str_id) extract_annots(gff, str_id, tab),
    str_ids,
    new
)

# Strip the 'ID=' prefix, then any ':<feature>:<number>' suffix, from the
# parsed gene field.
new$gene <- gsub(':[a-zA-Z]+:[0-9]+', '', gsub('ID=', '', new$gene))

write.table(
    new,
    file = 'araport_annot/Ath_STRs_full_annotations_031317.tsv',
    sep = '\t',
    quote = FALSE
)


