#!/usr/bin/Rscript

# AUTHOR: Charlie Barker 
#
# Uses DESEQ library to do differential expression analysis on expression data on cell shape data. Please note : 
# line 149 (cluster.of.interest <- "TWO") allows you to select which cell shape group you are calculating differential 
# expression from. The variable "col.data" gives an idea of how this relates with named cell lines.
#
# INPUT: 
#          MTAB_COUNTS.csv          expression data. If not present in the directory specified by the "path" variable,
#                                   it will download it for you.
#          pk_subtypes.csv          csv describing the prior knowledge for cell line subtypes 
#          shape_clusters.csv       csv describing our groups identified from cell morphology along 
#
# OUTPUT:   
#	 
#   hetero_DEGs.txt             file describing significantly differentially expressed genes.


library(gplots)
library(org.Hs.eg.db)
require(plyr)
library(plotly)
library(DESeq2)
library(readr)

suppressMessages( library( ExpressionAtlas ) )

# Working directory for the expression inputs; the relative paths used further
# down (the cached count table, "../geneID-type.csv") are resolved against it.
path<-"/home/charlie/phenotype_networks/data/expression"
setwd(path)

## GET DATA

# FPKM values cannot be used with edgeR, so the actual integer counts are
# loaded here rather than relying on Eirini's preprocessing and filtering.
#
# Result: `all.mtab`, a data frame with one column per sample (named by cell
# line) plus a column `X` holding the gene IDs, which the code below reads via
# `all.mtab$X`.
count.file <- "MTAB_COUNTS.csv"
if (file.exists(count.file)) {
  # Cached table. read.csv() passes the column names through make.names().
  # NOTE(review): the cache is expected to carry the gene IDs in a column
  # called "X" (what read.csv() yields for a CSV written with row names).
  all.mtab <- read.csv(count.file)
} else {
  datasets <- c("E-MTAB-2706", "E-MTAB-2770")
  allExps <- getAtlasData(datasets)
  E.MTAB.2706 <- allExps[["E-MTAB-2706"]]$rnaseq
  E.MTAB.2770 <- allExps[["E-MTAB-2770"]]$rnaseq
  counts.2706 <- data.frame(assays(E.MTAB.2706)$counts)
  counts.2770 <- data.frame(assays(E.MTAB.2770)$counts)

  # Name the sample columns by cell line while the tables contain sample
  # columns only. Adding an ID column first (as before) made the name vector
  # one element short and left the extra column named NA.
  colnames(counts.2706) <- E.MTAB.2706$cell_line
  colnames(counts.2770) <- E.MTAB.2770$cell_line

  # The two experiments are bound column-wise by row position, so they must
  # list the same genes in the same order; fail loudly otherwise.
  stopifnot(identical(rownames(counts.2706), rownames(counts.2770)))

  # Store the gene IDs in `X` so this branch produces the same layout as the
  # cached CSV. data.frame() also applies make.names() to the column names,
  # matching what read.csv() does in the other branch.
  all.mtab <- data.frame(X = rownames(counts.2706), counts.2706, counts.2770)
  #write.csv(all.mtab, file = "./MTAB_COUNTS.csv", row.names = FALSE)
}

## SELECTED BREAST CANCER CELL LINES:

# Two annotation tables list cell lines together with their tissue of origin;
# the breast entries of both are pooled.
rec_cellLines <- read_csv("~/cell_shapes/cell_line_diffexp/cell_line_seq.csv")
rec_cellLines2 <- read_csv("~/cell_shapes/cell_line_diffexp/cell_line_seq2.csv")

# `%in%` (unlike `==`) never returns NA, so rows with a missing tissue label
# are simply excluded instead of producing NA cell-line names.
is.breast <- rec_cellLines$`Tissue Supergroup` %in% "Breast"
is.breast2 <- rec_cellLines2$`Site Primary` %in% "breast"

# The column names of `all.mtab` have been through make.names() (read.csv /
# data.frame), so the annotation names get the same transformation. This still
# maps "-" and " " to "." as before, and additionally handles names that
# make.names() rewrites in other ways (e.g. a leading digit gains an "X"
# prefix), which previously could never match a count column.
breast_lines <- make.names(c(
  rec_cellLines$`Cell line`[is.breast],
  rec_cellLines2$`Cell line primary name`[is.breast2]
))

# Keep only the breast cell-line columns; drop = FALSE keeps a data frame even
# if a single column matches.
count.data <- all.mtab[, colnames(all.mtab) %in% breast_lines, drop = FALSE]
row.names(count.data) <- all.mtab$X
count.data$GeneID <- all.mtab$X
# Data contains many genes that are not protein encoding. I extract the gene ids and input them into
# ensembl biomart: Dataset: Ensembl Genes 89, Human genes (GRCh38.p10)
#                  Filters: None Selected
#                  Attributes: Gene stable ID, Gene type
# output saved in mart_export.csv

#### GeneXData <- read_csv("~/GeneXData.csv",col_types = cols(X1 = col_skip()))
library(readr)

# Gene ID -> gene type lookup table (the biomart export described above).
mart_export <- read_csv("../geneID-type.csv")
names(mart_export) <- c("GeneID", "GeneType")
row.names(mart_export) <- mart_export$GeneID

# Attach the gene type to every count row that has an entry in the lookup.
GeneXDType <- merge(x = count.data, y = mart_export, by = "GeneID")

# Retain protein-coding genes only; which() skips rows whose gene type is NA,
# exactly as subset() does.
count.data <- GeneXDType[which(GeneXDType$GeneType == "protein_coding"), ]



## PREP RNASEQ DATA

#my analysis
#seq.data <- read.csv("../data/GeneXDataNorm.csv", stringsAsFactors = FALSE)
#row.names(seq.data)<-seq.data$GeneID
#seq.data$X<-NULL

# Move the gene IDs into the row names and drop the two annotation columns so
# that only the per-cell-line count columns remain. The ID/type pairs are kept
# in ID.DF.
row.names(count.data) <- count.data$GeneID
ID.DF <- data.frame(count.data$GeneID, count.data$GeneType)
count.data$GeneID <- NULL
count.data$GeneType <- NULL

library(edgeR)

# Counts per million, used only to decide which genes are expressed enough to
# keep; the raw counts themselves are what gets filtered.
count.data.cpm <- cpm(count.data)

# A gene is kept when its CPM exceeds `min.cpm` in at least `min.samples`
# cell lines. (The old comment said 4 samples; the code has always used 8.)
min.cpm <- 0.5
min.samples <- 8
thresh <- count.data.cpm > min.cpm
keep <- rowSums(thresh) >= min.samples

# Report the table dimensions before and after filtering.
dim(count.data)
dim(count.data[keep, , drop = FALSE])
count.data <- count.data[keep, , drop = FALSE]

#removes a total of 45970 genes 
#convert to edger object 

#count data is now good enough to use in DESEQ2

## DESIGN MATRIX

#obtain sample information from PMID: 29158785 and Eirini's stuff
# One row per sample (cell line); the design column `a` is filled in per
# iteration below.
col.data <- data.frame(row.names = colnames(count.data))

# Register the number of cores to use
library(BiocParallel)
register(MulticoreParam(4))

# A one-vs-rest comparison needs at least one sample on each side.
stopifnot(nrow(col.data) >= 2)

# Make sure the output directory exists up front, so a missing directory does
# not abort the run only after the first (expensive) DESeq fit.
out.dir <- path.expand("~/cell_shapes/test")
dir.create(out.dir, showWarnings = FALSE, recursive = TRUE)

# For every cell line: contrast that single cell line (a = 1) against all the
# others (a = 0) and write the full DESeq2 results table to <cell_line>.csv.
for (cell in rownames(col.data)) {
  a <- as.numeric(rownames(col.data) == cell)
  # Explicit levels keep 0 ("rest") as the reference level.
  col.data$a <- factor(a, levels = c(0, 1))
  deseq2Data <- DESeqDataSetFromMatrix(
    countData = count.data,
    colData = col.data,
    design = ~a
  )
  deseq2Data <- DESeq(deseq2Data, parallel = TRUE)
  # contrast = (factor, numerator level, denominator level): this cell vs rest.
  deseq2Results <- results(deseq2Data, contrast = c("a", "1", "0"))
  deseq2ResDF <- as.data.frame(deseq2Results)
  # Dots in the (make.names-style) cell-line name become underscores.
  file.name <- paste0(gsub("\\.", "_", cell), ".csv")
  write.csv(deseq2ResDF, file.path(out.dir, file.name))
}


