library(seqinr)

# 1. load fasta sequences from files
## Files come from Ensembl; each is a multi-FASTA file whose records hold an Ensembl transcript ID and its sequence
# Each call yields a named list with one entry per FASTA record; the names are
# used further down as transcript IDs. `as.string = TRUE` keeps every sequence
# as a single string, so nchar() on an entry gives the sequence length.
# `forceDNAtolower = TRUE` is seqinr's default; it is spelled out because the
# placeholder filter in step 3 compares against lowercase text.
cds <- read.fasta(
  "data/cds_seq_mm10.txt.gz",
  seqtype = "DNA", as.string = TRUE, forceDNAtolower = TRUE
)
utr5 <- read.fasta(
  "data/5utr_seq_mm10.fa",
  seqtype = "DNA", as.string = TRUE, forceDNAtolower = TRUE
)
utr3 <- read.fasta(
  "data/3utr_seq_mm10.txt.gz",
  seqtype = "DNA", as.string = TRUE, forceDNAtolower = TRUE
)

# 2. get a data frame with the transcript ID and the length of the sequence 
# 5' UTR length table: one row per entry of `utr5`, with the transcript ID taken
# from the list names and the length from the number of characters in the
# sequence string. Built in one vectorised step instead of filling the data
# frame cell by cell, which is much faster on large inputs and also yields a
# valid zero-row table when `utr5` is empty (a `1:length(x)` loop would not).
utr5_df <- data.frame(
  transcript_id = as.character(names(utr5)),
  # vapply() guarantees exactly one integer length per sequence.
  utr5_length = vapply(utr5, nchar, integer(1), USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)

# 3' UTR length table: one row per entry of `utr3`, with the transcript ID taken
# from the list names and the length from the number of characters in the
# sequence string. Built in one vectorised step instead of filling the data
# frame cell by cell, which is much faster on large inputs and also yields a
# valid zero-row table when `utr3` is empty (a `1:length(x)` loop would not).
utr3_df <- data.frame(
  transcript_id = as.character(names(utr3)),
  # vapply() guarantees exactly one integer length per sequence.
  utr3_length = vapply(utr3, nchar, integer(1), USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)

# CDS length table: one row per entry of `cds`, with the transcript ID taken
# from the list names and the length from the number of characters in the
# sequence string. Built in one vectorised step instead of filling the data
# frame cell by cell, which is much faster on large inputs and also yields a
# valid zero-row table when `cds` is empty (a `1:length(x)` loop would not).
cds_df <- data.frame(
  transcript_id = as.character(names(cds)),
  # vapply() guarantees exactly one integer length per sequence.
  cds_length = vapply(cds, nchar, integer(1), USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)

# 3. Some sequences are reported as "sequence unavailable"; remove them

## clean(): return the names (transcript IDs) of the entries to keep, i.e. those
## whose sequence is not the "sequence unavailable" placeholder.
##
## list: named list holding one single-string sequence per element.
## Returns a character vector of names in input order; character(0) when the
## input is empty or when no entry is kept.
##
## The placeholder is matched case-insensitively, so it is recognised whether or
## not the sequences were lowercased on import. The comparison is vectorised,
## which avoids growing the result inside a loop and copes with an empty list.
clean <- function(list) {
  # as.character() drops the per-sequence attributes; vapply() also enforces
  # that every element is exactly one string.
  seqs <- vapply(list, as.character, character(1), USE.NAMES = FALSE)
  # %in% never yields NA, so the mask is always a plain TRUE/FALSE vector.
  is_placeholder <- tolower(seqs) %in% "sequence unavailable"
  as.character(names(list)[!is_placeholder])
}

# Keep only the rows whose transcript ID is among those returned by clean() for
# the matching sequence list, i.e. drop the "sequence unavailable" entries.
utr5_clean <- utr5_df[
  is.element(utr5_df[["transcript_id"]], clean(utr5)), ,
  drop = FALSE
]
cds_clean <- cds_df[
  is.element(cds_df[["transcript_id"]], clean(cds)), ,
  drop = FALSE
]
utr3_clean <- utr3_df[
  is.element(utr3_df[["transcript_id"]], clean(utr3)), ,
  drop = FALSE
]

# 4. Merging data frames

# Full outer join of the three length tables on transcript_id, folded left to
# right (5' UTR, then CDS, then 3' UTR). With all = TRUE a transcript missing
# from one table is kept and gets NA for that feature's length.
features_length <- Reduce(
  function(left, right) merge(left, right, by = "transcript_id", all = TRUE),
  list(utr5_clean, cds_clean, utr3_clean)
)
# Drop rows that are exact duplicates across every column.
features_length <- unique(features_length)

# Sanity check: row positions of transcript IDs that still occur more than once
# (printed at top level; integer(0) means there are none).
which(duplicated(features_length[["transcript_id"]]))

write.csv(
  features_length,
  file = "results/features_length_ensembl.csv",
  row.names = FALSE
)
