#!/usr/bin/env Rscript



# Raw positional command-line arguments. `args` is not referenced again in the
# visible script; the options actually used are parsed with optparse.
args = commandArgs(trailingOnly=TRUE)
# dplyr supplies group_by(), top_n() and the %>% pipe used in this script.
library(dplyr)
# NOTE(review): no ade4 function appears to be called in the visible script --
# confirm before dropping this dependency.
library(ade4)
# optparse supplies make_option(), OptionParser(), parse_args(), print_help().
library("optparse")

# ---- Command-line interface -------------------------------------------------
# Four string options are declared. Only --output carries a default
# ("out.tsv"); the other three default to NULL and are verified right after
# parsing.
option_list <- list(
  make_option(
    c("-i", "--input"),
    type = "character", default = NULL,
    help = "Input filename", metavar = "character"
  ),
  make_option(
    c("-o", "--output"),
    type = "character", default = "out.tsv",
    help = "output prefix [default= %default]", metavar = "character"
  ),
  make_option(
    c("-r", "--roNOG"),
    type = "character", default = NULL,
    help = "roNOG file", metavar = "character"
  ),
  make_option(
    c("-b", "--biomart"),
    type = "character", default = NULL,
    help = "Biomart mgi file", metavar = "character"
  )
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Mandatory options, listed in the order they are checked, each paired with
# the error raised when it is absent. The usage text is printed before
# aborting so the user sees the expected flags.
missing_option_errors <- c(
  input   = "At least one argument must be supplied (input)",
  roNOG   = "At least one argument must be supplied (roNOG)",
  biomart = "At least one argument must be supplied (biomart)"
)
for (option_name in names(missing_option_errors)) {
  if (is.null(opt[[option_name]])) {
    print_help(opt_parser)
    stop(missing_option_errors[[option_name]], call. = FALSE)
  }
}

# Short aliases used by the rest of the script.
InputBlastx <- opt$input
Output <- opt$output
roNOG <- opt$roNOG
biomart <- opt$biomart


# ---- EggNOG cluster membership ----------------------------------------------
# Cluster-to-member assignment, read as a tab-separated table without a header
# row: one row per cluster, with all member identifiers packed into one field.
clNOG <- read.csv(roNOG, header = FALSE, sep = "\t")

colnames(clNOG) <- c(
  "dataset", "ClusterID", "NbOfProteins", "NbOfspecies", "GeneID", "TaxonID"
)

# Expand the packed member field so that each (cluster, member) pair gets its
# own row. Members may be separated by commas and/or spaces.
# NOTE(review): an earlier comment spoke of splitting "column 6", but the field
# that is split is the one named GeneID (the 5th name above) -- confirm the
# column naming against the actual file layout.
member_lists <- strsplit(as.character(clNOG$GeneID), "[, ]")
clNOGok <- data.frame(
  ClusterID = rep(clNOG$ClusterID, lengths(member_lists)),
  GeneID = unlist(member_lists)
)

# Remove the leading numeric prefix ("<digits>.") from every member identifier
# (presumably the taxon id -- confirm). The dot is escaped and at least one
# digit is required, so identifiers that carry no numeric prefix are left
# untouched instead of losing their first character.
protein_ids <- sub("^[0-9]+\\.", "", as.character(clNOGok$GeneID))

# Identifiers that end up empty carry no information and are dropped together
# with their cluster entry (nzchar() is TRUE for NA, so NA rows are kept).
has_id <- nzchar(protein_ids)
clNOGokMus2 <- data.frame(
  ClusterID = clNOGok$ClusterID[has_id],
  GeneEnsProtID = protein_ids[has_id]
)


# ---- Ensembl / MGI annotation table -----------------------------------------
# Historical note kept from the original script: an Ensembl ID converter was
# used to carry IDs from release 70 to release 88, and this table (Ensembl IDs,
# transcript lengths, MGI IDs) was downloaded from Core Ensembl release 70.
#
# The file is read without a header and its first line is then discarded, so
# every column keeps the type inferred with that first line included.
BiomartMus <- read.delim(biomart, header = FALSE)
BiomartMus <- BiomartMus[-1, ]
names(BiomartMus) <- c(
  "EnsGeneID", "EnsTranscriptID", "MGI", "EnsTranscriptLen", "GeneEnsProtID"
)

# ---- BLASTX hits ------------------------------------------------------------
# Tabular BLASTX output with 11 columns: query = transcriptome contig,
# subject = Ensembl gene ID (not ordered by EggNOG cluster).
fich_blast <- read.delim(InputBlastx, header = FALSE)
names(fich_blast) <- c(
  "ContigID", "GeneID", "Evalue", "BitScore", "Length", "pident",
  "qstart", "qend", "sstart", "send", "qlen"
)

# Keep only hits with Length above 50 and pident above 60.
long_enough <- fich_blast$Length > 50
similar_enough <- fich_blast$pident > 60
fich_blast <- fich_blast[long_enough & similar_enough, ]

# ---- Join clusters with MGI identifiers and with BLAST hits -----------------
# Attach the MGI identifier to every cluster member by inner-joining on the
# protein ID, then keep only the (cluster, MGI) pair and build a combined
# "<MGI>_<ClusterID>" label.
Annot1a <- merge(clNOGokMus2, BiomartMus, by = "GeneEnsProtID")
Annot1a <- Annot1a[, c("ClusterID", "MGI")]
Annot1a$MGI_ClusterID <- paste(Annot1a$MGI, Annot1a$ClusterID, sep = "_")

# An earlier variant of this script restricted Annot1a to clusters and MGI
# identifiers that occur exactly once (objects suffixed "_NoPara") before the
# final join; that restriction is currently disabled.

# Attach the cluster to every retained BLAST hit (inner join on the subject ID).
Annot1b <- merge(clNOGok, fich_blast, by = "GeneID")

# Combine both views through the cluster ID and keep the columns needed for
# the per-cluster selection that follows.
Annot2 <- merge(Annot1a, Annot1b, by = "ClusterID")
Annot2 <- Annot2[
  ,
  c("ClusterID", "MGI_ClusterID", "ContigID", "MGI", "BitScore", "qlen")
]
#################################################################################################################################################
# Within each cluster keep the row(s) with the highest BitScore. top_n() keeps
# every row tied for the maximum, so a cluster can still hold several rows
# here. The result stays grouped by ClusterID.
blast_1percontig_Int <- top_n(group_by(Annot2, ClusterID), 1, BitScore)

# Narrow down to the columns that are written out.
blast_1percontig_Int2 <- subset(
  blast_1percontig_Int,
  select = c("ClusterID", "ContigID", "MGI_ClusterID", "qlen")
)
# De-duplicated copy of the table; it is not read again in the visible script.
usub <- unique(blast_1percontig_Int2)

# Abort when nothing survived the joins and filters for this input.
if (nrow(blast_1percontig_Int2) == 0L) {
  stop(
    paste("ERROR No gene for this species (", InputBlastx, ")"),
    call. = FALSE
  )
}

# ---- Reduce to a single row per cluster -------------------------------------
# Clusters represented by more than one row are cut down to their first row.
# Output order: all rows of single-row clusters first, followed by the first
# row of each multi-row cluster.
cluster_of_row <- blast_1percontig_Int2$ClusterID
rows_per_cluster <- table(cluster_of_row)
tied_clusters <- names(rows_per_cluster)[rows_per_cluster > 1]
in_tied_cluster <- cluster_of_row %in% tied_clusters

# Original author's note (translated): this works because the grouping step
# put the best hit first.
# NOTE(review): if the rows of a cluster all share the same top BitScore at
# this point, "first" simply means first in row order -- confirm this is the
# intended tie-break.
first_tied_rows <- which(in_tied_cluster & !duplicated(cluster_of_row))
indexok <- c(which(!in_tied_cluster), first_tied_rows)
blast_1percontig_Int3 <- blast_1percontig_Int2[indexok, ]
# WARNING (kept from the original): planned change -- take the longest contig
# instead of removing the extra rows.

# ---- Report and export ------------------------------------------------------
# Log the input name and, for every output column, its number of distinct
# values.
print(InputBlastx)
distinct_per_column <- apply(
  blast_1percontig_Int3,
  2,
  function(column_values) {
    length(unique(column_values))
  }
)
print(distinct_per_column)

# Tab-separated export with a header line, no quoting and no row names.
write.table(
  blast_1percontig_Int3,
  file = Output,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)
