#!/usr/bin/Rscript

# AUTHOR:	Eirini Petsalaki
# Merges gene expression from two breast cancer datasets and uses biomart to only show protein coding genes 
# 
# INPUT: 
#          E-MTAB-results,          FPKM expression values across all genes, from Expression Atlas.
#
# OUTPUT:   
#	                   
#	   GeneXData.csv		        filtered FPKM expression values for use in wgcna.R
#

library(readr)
# Directory holding the Expression Atlas downloads; every relative file name
# used below is resolved against it.
path <- "~/phenotype_networks/data/expression"
setwd(path)

# Reads one Expression Atlas "query results" file: tab-delimited, skipping the
# first four lines so that the next line is taken as the header row.
read_atlas_results <- function(tsv_file) {
  read_delim(
    tsv_file,
    delim = "\t",
    escape_double = FALSE,
    trim_ws = TRUE,
    skip = 4
  )
}

# Source of the first dataset (E-MTAB-2706):
# http://www.ebi.ac.uk/gxa/experiments/E-MTAB-2706/Downloads?specific=true&geneQuery=%255B%255D&filterFactors=%257B%2522CELL_LINE%2522%253A%255B%2522HCC70%2522%252C%2522SUM%2520159PT%2522%252C%2522SUM%2520149PT%2522%255D%257D&cutoff=%257B%2522value%2522%253A0.5%257D
E_MTAB_2706_query_results <- read_atlas_results("E-MTAB-2706-query-results.tsv")
# 57073 obs. of  624 variables

# Source of the second dataset (E-MTAB-2770):
# http://www.ebi.ac.uk/gxa/experiments/E-MTAB-2770/Downloads?specific=true&geneQuery=%255B%255D&filterFactors=%257B%2522CELL_LINE%2522%253A%255B%2522BT-474%2522%252C%2522CAMA-1%2522%252C%2522HCC1143%2522%252C%2522HCC1954%2522%252C%2522Hs%2520578T%2522%252C%2522JIMT-1%2522%252C%2522MCF7%2522%252C%2522MDA-MB-157%2522%252C%2522MDA-MB-231%2522%252C%2522SK-BR-3%2522%252C%2522T-47D%2522%252C%2522ZR-75-1%2522%255D%257D&cutoff=%257B%2522value%2522%253A0.5%257D
E_MTAB_2770_query_results <- read_atlas_results("E-MTAB-2770-query-results.tsv")
# 57073 obs. of  936 variables

# Join the two tables on the shared gene identifier. Other columns present in
# both inputs receive merge()'s default ".x" / ".y" suffixes.
ALL_E_MTAB <- merge(
  x = E_MTAB_2706_query_results,
  y = E_MTAB_2770_query_results,
  by = "Gene ID"
)
# 57073 obs. of  1559 variables (only CAMA1 was common)

# SELECTED BREAST CANCER CELL LINES:
# Names are the short labels used as column names in ALLGeneXData; values are
# the column headers looked up in the merged Expression Atlas table.
# Noted as missing in the original script: MCDMB453, MCF12A, SUM149, SUM159.
cell_line_columns <- c(
  BT474    = "BT-474, invasive ductal carcinoma",
  CAMA1    = "CAMA-1, breast adenocarcinoma",
  T47D     = "T-47D, invasive ductal carcinoma",
  ZR751    = "ZR-75-1, invasive ductal carcinoma",
  SKBR3    = "SK-BR-3, breast adenocarcinoma",
  MCF7     = "MCF7, invasive ductal carcinoma",
  HCC1143  = "HCC1143, breast, breast ductal adenocarcinoma",
  HCC1954  = "HCC1954, breast ductal adenocarcinoma",
  HCC70    = "HCC70, breast, breast ductal adenocarcinoma",
  hs578T   = "Hs 578T, invasive ductal carcinoma",
  JIMT1    = "JIMT-1, breast carcinoma",
  MCF10A   = "MCF 10A, breast, breast fibrocystic disease",
  MDAMB157 = "MDA-MB-157, breast carcinoma",
  MDAMB231 = "MDA-MB-231, breast adenocarcinoma"
)

# Gene identifier columns come first, followed by the cell lines in the order
# listed above.
selected_columns <- c(
  GeneID   = "Gene ID",
  GeneName = "Gene Name.x",
  cell_line_columns
)

# Fail early with the offending headers. `$` on a data.frame partial-matches
# names, so a missing header could otherwise pick a different column or
# return NULL and only surface later as a column-count mismatch.
missing_columns <- setdiff(selected_columns, colnames(ALL_E_MTAB))
if (length(missing_columns) > 0) {
  stop(
    "Columns not found in the merged expression table: ",
    paste(missing_columns, collapse = "; "),
    call. = FALSE
  )
}

# Exact-name selection of the wanted columns, then relabel with short names.
ALLGeneXData <- as.data.frame(
  ALL_E_MTAB[, unname(selected_columns), drop = FALSE]
)
colnames(ALLGeneXData) <- names(selected_columns)

# Per-cell-line expression vectors. Nothing later in this script reads them;
# they are kept under their original names in case interactive sessions or
# other code rely on them.
BT474 <- ALLGeneXData[["BT474"]]
CAMA1 <- ALLGeneXData[["CAMA1"]]
T47D <- ALLGeneXData[["T47D"]]
ZR75.1 <- ALLGeneXData[["ZR751"]]
SKBR3 <- ALLGeneXData[["SKBR3"]]
MCF7 <- ALLGeneXData[["MCF7"]]
HCC1143 <- ALLGeneXData[["HCC1143"]]
HCC1954 <- ALLGeneXData[["HCC1954"]]
HCC70 <- ALLGeneXData[["HCC70"]]
hs578T <- ALLGeneXData[["hs578T"]]
JIMT1 <- ALLGeneXData[["JIMT1"]]
MCF10A <- ALLGeneXData[["MCF10A"]]
MDAMB157 <- ALLGeneXData[["MDAMB157"]]
MDAMB231 <- ALLGeneXData[["MDAMB231"]]

# Keep genes whose summed FPKM across the selected cell lines exceeds 1.
# Every column except the two gene identifier columns is an expression column.
expression_columns <- setdiff(colnames(ALLGeneXData), c("GeneID", "GeneName"))
total_expression <- rowSums(ALLGeneXData[expression_columns])

# rowSums() yields NA for a gene with a missing value in any cell line, and
# indexing rows by an NA logical inserts all-NA rows into the result. which()
# keeps only the rows where the comparison is TRUE, so those genes are dropped
# cleanly instead of appearing as empty rows in the CSV.
# NOTE(review): if blank Expression Atlas cells should count as zero expression
# rather than exclude the gene, use rowSums(..., na.rm = TRUE) — confirm.
GeneXData <- ALLGeneXData[which(total_expression > 1), ]
write.csv(GeneXData, file = "GeneXDataNofilt.csv")

# The expression table still includes many genes that do not encode proteins.
# Their gene IDs were submitted to Ensembl BioMart with these settings:
#   Dataset:    Ensembl Genes 89, Human genes (GRCh38.p10)
#   Filters:    None Selected
#   Attributes: Gene stable ID, Gene type
# The export is read below from ../geneID-type.csv (the original note referred
# to it as mart_export.csv).

#### GeneXData <- read_csv("~/GeneXData.csv",col_types = cols(X1 = col_skip()))

# Gene ID -> gene type lookup, relabelled so it can be joined on "GeneID".
mart_export <- setNames(
  read_csv("../geneID-type.csv"),
  c("GeneID", "GeneType")
)

# Attach the gene type to every expression row that has a match in the lookup.
GeneXDType <- merge(x = GeneXData, y = mart_export, by = "GeneID")

# Retain protein-coding genes only; %in% treats a missing gene type as no match.
is_protein_coding <- GeneXDType$GeneType %in% "protein_coding"
GXDatafilt <- GeneXDType[is_protein_coding, ]
# 15304 obs of 17 variables
write.csv(GXDatafilt, file = "GeneXData.csv")
# TO LOAD: read_csv("~/GeneXData.csv",col_types = cols(X1 = col_skip()))



