library(DESeq2)
library(tximport)
library(tximportData)
library(readr)
library(stringr)
library(dplyr)
library(readxl)
library(rhdf5)
library(tidyr)
library(devtools)
library(Rsubread)
library(purrr)

# Set the `bedtools.path` option to the directory used for bedtools.
options(bedtools.path = "~/gersbachlab/cmk90/") # setting path to bedtools
# One-time setup, left commented out: install bedtoolsr from GitHub. This has to
# be run after the options() call above.
#devtools::install_github("PhanstielLab/bedtoolsr")

# Optional check, left commented out, that bedtoolsr is installed properly.
#testthat::test_package("bedtoolsr")



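# Create the input data frame. Column 1 (Experiment) is the ENCODE experiment accession number. Column 2 (BAM) is the BAM file accession and column 3 (BED) is the BED file accession; the SAF annotation for a row is looked up as saf_files/<BED>_annot.saf.

# This is for single-end reads only.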

# Sample sheet, written one experiment per line as
# (Experiment accession, BAM accession, BED accession) and reshaped into a
# three-column character data frame.
input_df <- as.data.frame(
  matrix(
    c(
      "ENCSR747SEU", "ENCFF041ZOJ", "ENCFF591LZL",
      "ENCSR238FMP", "ENCFF366VIM", "ENCFF979JED",
      "ENCSR299QGI", "ENCFF631HMZ", "ENCFF221GCG",
      "ENCSR298OIK", "ENCFF287TWO", "ENCFF521JOO",
      "ENCSR517NHP", "ENCFF603PQU", "ENCFF997ZRY",
      "ENCSR272RQX", "ENCFF622FLC", "ENCFF593ULX",
      "ENCSR622CAH", "ENCFF056MLO", "ENCFF570VFR",
      "ENCSR514QHN", "ENCFF320RNH", "ENCFF011FVM",
      "ENCSR000EOO", "ENCFF449PKZ", "ENCFF352KQN",
      "ENCSR000EQA", "ENCFF012DKI", "ENCFF790RHQ",
      "ENCSR628NEA", "ENCFF397QAV", "ENCFF096WOZ",
      "ENCSR933GMM", "ENCFF427BNB", "ENCFF912XTH",
      "ENCSR909HFI", "ENCFF520UZZ", "ENCFF905THS",
      "ENCSR608NWP", "ENCFF920IXO", "ENCFF030GBB"
    ),
    ncol = 3,
    byrow = TRUE,
    dimnames = list(NULL, c("Experiment", "BAM", "BED"))
  ),
  stringsAsFactors = FALSE
)

# Define function to perform read counting using featureCounts.
# This variant is for single-end reads only (isPairedEnd = FALSE).

# Candidate rows of input_df for the single-end pass.
# NOTE(review): this vector is only used when it is passed explicitly as `rows`
# to single_perform_read_counting(), and it holds the same rows as
# paired_read_rows_to_iterate -- confirm which rows are really single-end.
single_read_rows_to_iterate <- c(1:4, 9, 11:14)

# Count single-end reads per SAF region for selected rows of a sample sheet.
#
# For each selected row, reads bam_files/<BAM>.bam, counts reads against
# saf_files/<BED>_annot.saf with featureCounts(), and writes the count matrix
# to <output_dir>/<Experiment>_counts.txt (tab-separated, row names kept).
#
# Args:
#   input_df:   data frame with character columns "Experiment", "BAM", "BED".
#   output_dir: directory for the count tables; created if it does not exist.
#   rows:       row indices of input_df to process. NULL (the default) keeps
#               the previous behaviour of processing row 14 through the last
#               row; a frame with fewer than 14 rows then selects nothing.
#
# Returns:
#   NULL, invisibly. Called for its side effects (files written, status lines
#   printed). Stops if `rows` refers to rows that do not exist.
single_perform_read_counting <- function(input_df, output_dir, rows = NULL) {
  n_rows <- nrow(input_df)

  if (is.null(rows)) {
    # Filtering seq_len() instead of writing 14:n_rows avoids the count-down
    # sequence (e.g. 14:13 is c(14, 13)) on short frames.
    rows <- seq_len(n_rows)
    rows <- rows[rows >= 14L]
  }
  if (!all(rows %in% seq_len(n_rows))) {
    stop("`rows` must index existing rows of input_df (1..", n_rows, ")",
         call. = FALSE)
  }

  # Make sure the destination exists before any slow counting starts, so a
  # missing directory cannot fail the write after the work is done.
  dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

  for (i in rows) {
    # Input paths are derived from the accession columns of this row.
    bam_file <- paste0("bam_files/", input_df[i, "BAM"], ".bam")
    saf_file <- paste0("saf_files/", input_df[i, "BED"], "_annot.saf")

    # Output file is named after the experiment accession.
    output_file <- paste0(output_dir, "/", input_df[i, "Experiment"], "_counts.txt")

    result <- featureCounts(
      bam_file, # BAM file to count

      # annotation
      annot.inbuilt = "hg38",
      annot.ext = saf_file, # external SAF annotation for this row
      isGTFAnnotationFile = FALSE, # FALSE because the annotation is SAF
      chrAliases = NULL,

      # level of summarization
      useMetaFeatures = FALSE, # FALSE for DNase and ATAC-seq; maybe TRUE for RNA-seq

      # overlap between reads and features
      allowMultiOverlap = TRUE, # a read overlapping several regions is counted for each
      minOverlap = 1, # bp of the read that must overlap a region; may want to increase
      fracOverlap = 0,
      fracOverlapFeature = 0,
      largestOverlap = FALSE, # TRUE would assign a read only to the region with the largest overlap
      nonOverlap = NULL,
      nonOverlapFeature = NULL,

      # read shift, extension and reduction
      # (assumed to have been handled already by the ENCODE pipeline)
      readShiftType = "upstream",
      readShiftSize = 0,
      readExtension5 = 0,
      readExtension3 = 0,
      read2pos = NULL,

      # multi-mapping reads
      countMultiMappingReads = TRUE,

      # fractional counting
      fraction = FALSE,

      # long reads
      isLongRead = FALSE,

      # read filtering
      minMQS = 0,
      splitOnly = FALSE,
      nonSplitOnly = FALSE,
      primaryOnly = FALSE,
      ignoreDup = FALSE,

      # strandness
      strandSpecific = 0,

      # exon-exon junctions
      juncCounts = FALSE,
      genome = NULL,

      # parameters specific to paired-end reads; ignored because isPairedEnd = FALSE
      isPairedEnd = FALSE,
      countReadPairs = TRUE,
      requireBothEndsMapped = FALSE,
      checkFragLength = FALSE,
      minFragLength = 50,
      maxFragLength = 600,
      countChimericFragments = TRUE,
      autosort = TRUE,

      # number of CPU threads
      nthreads = 1,

      # read group
      byReadGroup = FALSE,

      # report assignment result for each read
      reportReads = NULL,
      reportReadsPath = NULL,

      # miscellaneous
      maxMOp = 10, # maximum number of 'M' operations allowed in a CIGAR string
      tmpDir = ".",
      verbose = FALSE
    )

    # Keep only the count matrix and write it with its row names.
    counts <- result$counts
    write.table(counts, output_file, sep = "\t", quote = FALSE, col.names = NA)

    cat("Read counting completed for", input_df[i, "Experiment"], "\n")
  }

  invisible(NULL)
}


# Directory (relative to the working directory) that receives one
# "<Experiment>_counts.txt" file per processed row.
output_dir <- "feature_counts_output"

# Run the single-end counting pass over input_df.
single_perform_read_counting(input_df, output_dir)


# Paired-end read counting. This reuses input_df from above: column 1 is the ENCODE experiment accession number, column 2 is the BAM accession and column 3 is the BED accession used to locate the SAF file.

# Define function to perform read counting using featureCounts.
# This variant is for paired-end reads (isPairedEnd = TRUE).

# Rows of input_df processed by the paired-end pass by default.
# NOTE(review): these are the same rows as single_read_rows_to_iterate --
# confirm which rows are really paired-end.
paired_read_rows_to_iterate <- c(1:4, 9, 11:14)

# Count paired-end fragments per SAF region for selected rows of a sample sheet.
#
# For each selected row, reads bam_files/<BAM>.bam, counts read pairs against
# saf_files/<BED>_annot.saf with featureCounts(), and writes the count matrix
# to <output_dir>/<Experiment>_counts.txt (tab-separated, row names kept).
#
# Args:
#   input_df:   data frame with character columns "Experiment", "BAM", "BED".
#   output_dir: directory for the count tables; created if it does not exist.
#   rows:       row indices of input_df to process. Defaults to
#               paired_read_rows_to_iterate, which is what the function used
#               implicitly before this argument existed.
#
# Returns:
#   NULL, invisibly. Called for its side effects (files written, status lines
#   printed). Stops if `rows` refers to rows that do not exist.
paired_perform_read_counting <- function(input_df, output_dir,
                                         rows = paired_read_rows_to_iterate) {
  n_rows <- nrow(input_df)
  if (!all(rows %in% seq_len(n_rows))) {
    stop("`rows` must index existing rows of input_df (1..", n_rows, ")",
         call. = FALSE)
  }

  # Make sure the destination exists before any slow counting starts, so a
  # missing directory cannot fail the write after the work is done.
  dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

  for (i in rows) {
    # Input paths are derived from the accession columns of this row.
    bam_file <- paste0("bam_files/", input_df[i, "BAM"], ".bam")
    saf_file <- paste0("saf_files/", input_df[i, "BED"], "_annot.saf")

    # Output file is named after the experiment accession.
    output_file <- paste0(output_dir, "/", input_df[i, "Experiment"], "_counts.txt")

    result <- featureCounts(
      bam_file, # BAM file to count

      # annotation
      annot.inbuilt = "hg38",
      annot.ext = saf_file, # external SAF annotation for this row
      isGTFAnnotationFile = FALSE, # FALSE because the annotation is SAF
      chrAliases = NULL,

      # level of summarization
      useMetaFeatures = FALSE, # FALSE for DNase and ATAC-seq; maybe TRUE for RNA-seq

      # overlap between reads and features
      allowMultiOverlap = TRUE, # a read overlapping several regions is counted for each
      minOverlap = 1, # bp of the read that must overlap a region; may want to increase
      fracOverlap = 0,
      fracOverlapFeature = 0,
      largestOverlap = FALSE, # TRUE would assign a read (pair) only to the region with the largest overlap
      nonOverlap = NULL,
      nonOverlapFeature = NULL,

      # read shift, extension and reduction
      # (assumed to have been handled already by the ENCODE pipeline)
      readShiftType = "upstream",
      readShiftSize = 0,
      readExtension5 = 0,
      readExtension3 = 0,
      read2pos = NULL,

      # multi-mapping reads
      countMultiMappingReads = TRUE,

      # fractional counting
      fraction = FALSE,

      # long reads
      isLongRead = FALSE,

      # read filtering
      minMQS = 0,
      splitOnly = FALSE,
      nonSplitOnly = FALSE,
      primaryOnly = FALSE,
      ignoreDup = FALSE,

      # strandness
      strandSpecific = 0,

      # exon-exon junctions
      juncCounts = FALSE,
      genome = NULL,

      # parameters specific to paired-end reads
      isPairedEnd = TRUE,
      countReadPairs = TRUE,
      requireBothEndsMapped = FALSE,
      checkFragLength = FALSE,
      minFragLength = 50, # not sure about these; passed with checkFragLength = FALSE
      maxFragLength = 600, # not sure about these; passed with checkFragLength = FALSE
      countChimericFragments = TRUE,
      autosort = TRUE,

      # number of CPU threads
      nthreads = 1,

      # read group
      byReadGroup = FALSE,

      # report assignment result for each read
      reportReads = NULL,
      reportReadsPath = NULL,

      # miscellaneous
      maxMOp = 10, # maximum number of 'M' operations allowed in a CIGAR string
      tmpDir = ".",
      verbose = FALSE
    )

    # Keep only the count matrix and write it with its row names.
    counts <- result$counts
    write.table(counts, output_file, sep = "\t", quote = FALSE, col.names = NA)

    cat("Read counting completed for", input_df[i, "Experiment"], "\n")
  }

  invisible(NULL)
}


# Output directory for the paired-end pass (re-assigned to the same value as
# before).
output_dir <- "feature_counts_output"

# Run the paired-end counting pass. Its output files are named
# "<Experiment>_counts.txt", the same pattern the single-end pass uses, so a
# row processed by both passes ends up holding the paired-end counts.
paired_perform_read_counting(input_df, output_dir)



# Folder containing the per-experiment count tables (.txt files).
folder_path <- "feature_counts_output/"

# Full paths of every file in that folder whose name ends in ".txt".
file_list <- list.files(path = folder_path, pattern = "\\.txt$", full.names = TRUE)

# Read one tab-separated table and drop its first row.
#
# Args:
#   file: path to a tab-delimited text file whose first line is a header.
#
# Returns:
#   A data frame with the file's contents minus the first row. The result is
#   always a data frame, including when the file has a single column.
#
# NOTE(review): header = TRUE already consumes the header line, so the row
# removed here is the first data row, not the header. Confirm that dropping it
# is intended.
read_and_remove_first_line <- function(file) {
  df <- read.table(file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

  # drop = FALSE stops a one-column table from collapsing to a bare vector.
  df[-1, , drop = FALSE]
}

# Load every count table; the reader drops the first row of each.
dataframes <- lapply(file_list, read_and_remove_first_line)

# Fold the tables into one wide table with successive left joins on the
# identifier column "X" (rows are those of the first table).
result <- Reduce(
  function(left, right) left_join(left, right, by = "X"),
  dataframes
)

# Move the identifier column into the row names, then drop that column.
rownames(result) <- result[[1]]
result <- result[, -1]

# Column order to impose on the merged table (one column per BAM file).
desired_order <- c(
  "ENCFF041ZOJ.bam", "ENCFF366VIM.bam", "ENCFF631HMZ.bam", "ENCFF287TWO.bam",
  "ENCFF603PQU.bam", "ENCFF622FLC.bam", "ENCFF056MLO.bam", "ENCFF320RNH.bam",
  "ENCFF449PKZ.bam", "ENCFF012DKI.bam", "ENCFF397QAV.bam", "ENCFF427BNB.bam",
  "ENCFF520UZZ.bam", "ENCFF920IXO.bam"
)
result <- result[, desired_order]

# Sample annotation: one row per count column, labelled by tissue
# (columns 1-4 heart, 5-10 skeletal, 11-14 liver).
coldata <- data.frame(
  tissue = rep(c("heart", "skeletal", "liver"), times = c(4, 6, 4)),
  row.names = colnames(result),
  stringsAsFactors = FALSE
)

# Sanity check: annotation rows line up with the count columns.
all(rownames(coldata) == colnames(result))

# DESeq2: heart versus liver, with liver as the reference level.

# Keep the heart (1-4) and liver (11-14) samples in both tables.
heart_v_liver_coldata <- slice(coldata, c(1:4, 11:14))
heart_v_liver_result <- select(result, 1:4, 11:14)

dds_heart_v_liver <- DESeqDataSetFromMatrix(
  countData = heart_v_liver_result,
  colData = heart_v_liver_coldata,
  design = ~ tissue
)
dds_heart_v_liver

# Make liver the "control" level of the tissue factor.
dds_heart_v_liver$tissue <- relevel(dds_heart_v_liver$tissue, ref = "liver")

# Fit the model and pull the results table.
dds_heart_v_liver <- DESeq(dds_heart_v_liver)
res <- results(dds_heart_v_liver)
summary(res)

# Number of features with adjusted p-value below 0.05.
sum(res$padj < 0.05, na.rm = TRUE)

# Results sorted by ascending p-value.
resOrdered <- res[order(res$pvalue), ]

# MA plot of the unsorted results.
plotMA(res)

# Export the sorted table as CSV.
write.csv(
  as.data.frame(resOrdered),
  file = "heart_vs_liver_DESeq2.csv"
)

# DESeq2: heart versus skeletal muscle, with skeletal as the reference level.

# Keep the heart (1-4) and skeletal (5-10) samples in both tables.
heart_v_skeletal_coldata <- slice(coldata, c(1:10))
heart_v_skeletal_result <- select(result, 1:10)

dds_heart_v_skeletal <- DESeqDataSetFromMatrix(
  countData = heart_v_skeletal_result,
  colData = heart_v_skeletal_coldata,
  design = ~ tissue
)
dds_heart_v_skeletal

# Make skeletal the "control" level of the tissue factor.
dds_heart_v_skeletal$tissue <- relevel(dds_heart_v_skeletal$tissue, ref = "skeletal")

# Fit the model and pull the results table.
dds_heart_v_skeletal <- DESeq(dds_heart_v_skeletal)
res_hvs <- results(dds_heart_v_skeletal)
summary(res_hvs)

# Number of features with adjusted p-value below 0.05.
sum(res_hvs$padj < 0.05, na.rm = TRUE)

# Results sorted by ascending p-value.
resOrdered_hvs <- res_hvs[order(res_hvs$pvalue), ]

# MA plot of the unsorted results.
plotMA(res_hvs)

# Export the sorted table as CSV.
write.csv(
  as.data.frame(resOrdered_hvs),
  file = "heart_vs_skmuscle_DESeq2.csv"
)
