##
## The following notebook reproduces the analysis of EMT scores in TCGA cancer types, including
## association with patient survival and treatment response,
## score distribution across cancer types and primary/metastatic tumors.
## It also includes an analysis of length distributions of hallmark sets.
## To run this script, first download the TCGA datasets and the MSigDB hallmark gene sets
## as indicated in the paper.
##


library(ggplot2)
library(msigdb)
library(msigdbi)
library(fgsea)
library(data.table)
library(dplyr)
library(stringr)
library(ggpubr)
library(survival)
library(survminer)
library(broom)
library(dplyr)
library(ggpubr)
library(ggrepel)
library(ggplot2)

outputPath <- <outputPath>

#1. Plot the length distribution of hallmarks of cancer sets.
all = "msigdb.v2025.1.Hs.symbols.gmt"
all_set <- read.gmt(all)

lengths <- c()
for (hal in all_set$genesets){
  lengths <- c(lengths, length(all_set$genesets[hal]))
  
  
}

dd <- data.frame(lengths = lengths)
p <- ggplot(dd, aes(x = lengths)) + geom_histogram(binwidth=50, 
color="black", fill="white") +
  theme_classic() +
  labs(x = "Gene signature length", y = "Count") +
  theme(axis.title = element_text(size=14, color="black"),
        axis.text = element_text(size=14, color="black"),
        axis.text.x = element_text(angle = 60, hjust=1, vjust=1)) +
  scale_x_continuous(breaks = seq(0,2000,100)) +
  scale_y_continuous(breaks = seq(0,15000,1000))

png(filename =paste0(outputPath, "Histogram_signature_length.png"))
plot(p)
dev.off()

#2. Plot distribution of cancer cell- and hallmark-EMT signatures across cancer types and malignancy.
#Read scores calculated using "score_EMT_signature_TCGA.R" script:
results_scores <- read.csv(paste0(outputPath, "EMT_scores_loged_zscore.txt"), sep='\t')
results_scores$cancer <- gsub("TCGA_", "", results_scores$cancer)
results_scores$cancer <- gsub("_rsem_curated", "", results_scores$cancer)

ord <- data.frame(results_scores %>% group_by(cancer) %>% summarize(median(EMT)))

p <- ggplot(results_scores, aes(x=factor(cancer, levels = ord[order(ord$median, decreasing=TRUE),"cancer"]), y = EMT)) + 
  geom_boxplot(fill = "brown3")

pp <- p + theme_classic() + labs(x = "TCGA cancer", y = "EMT score") +
  theme(axis.text = element_text(size=16, color="black"), 
        axis.title = element_text(size=16, color="black"),
        axis.text.x = element_text(angle = 90, hjust=1, vjust=0))


svg(filename =paste0(outputPath, "EMT_across_cancers.svg"), 8, 5)
plot(pp)
dev.off()

p <- ggplot(results_scores, aes(x=factor(cancer, levels = ord[order(ord$median, decreasing=TRUE),"cancer"]), y = HalEMT)) + 
  geom_boxplot(fill = "brown3")

pp <- p + theme_classic() + labs(x = "TCGA cancer", y = "Hallmark EMT score") +
  theme(axis.text = element_text(size=16, color="black"), 
        axis.title = element_text(size=16, color="black"),
        axis.text.x = element_text(angle = 90, hjust=1, vjust=0))

svg(filename =paste0(outputPath, "HalEMT_across_cancers.svg"), 8, 5)
plot(pp)
dev.off()

#Plot the distribution across primary / metastatic samples:
cancers_metastatic <- unique(results_scores[(grepl("\\.06|\\.07", rownames(results_scores))),"cancer"])

tmp <- results_scores[results_scores$cancer %in%  cancers_metastatic,]
tmp$status <- "Unknown"
tmp[grepl("\\.01|\\.03|\\.09|\\.05", rownames(tmp)),"status"] <- "Primary"
tmp[grepl("\\.07|\\.06", rownames(tmp)),"status"] <- "Metastatic"

tmp <- tmp[tmp$status!= "Unknown",]
tmp$sample <- str_sub(rownames(tmp), 1, 12)

#Use only unpaired samples:
paired <- names(table(tmp$sample)[table(tmp$sample)>1])
paired_tmp <- tmp[tmp$sample %in% paired,]

tmp <- tmp[!tmp$sample %in% paired,]

#Enough unpaired samples are only in GBM, OV and SKCM:
tmp2 <- tmp[tmp$cancer %in% c("GBM", "OV", "SKCM"),]
p<-ggboxplot(tmp2, x = "cancer", y = "EMT", fill = "status") + theme_minimal() + theme(axis.title = element_text(size=16, color="black"),
                                   axis.text = element_text(size=16, color="black"),
                                  axis.text.x = element_text(angle=90, vjust=1, hjust=1),
                                  legend.text = element_text(size=16),
                                  legend.title= element_text(size=16),
                                  legend.position = "top") + 
  scale_fill_manual("Status", values=c("brown3", "lightblue4")) +
  labs(x = "TCGA cancer", y = "EMT score") + 
  stat_compare_means(aes(group=status), label="p.signif", size=6)
  

svg(filename =paste0(outputPath, "EMT_primary_metastatic_unpaired.svg"), 5, 5)
plot(p)
dev.off()

#Repeat the plot for hallmark EMT:
p<-ggboxplot(tmp2, x = "cancer", y = "HalEMT", fill = "status") + theme_minimal() + theme(axis.title = element_text(size=16, color="black"),
                                   axis.text = element_text(size=16, color="black"),
                                  axis.text.x = element_text(angle=90, vjust=1, hjust=1),
                                  legend.text = element_text(size=16),
                                  legend.title= element_text(size=16),
                                  legend.position = "top") + 
  scale_fill_manual("Status", values=c("brown3", "lightblue4")) +
  labs(x = "TCGA cancer", y = "Hallmark EMT score") + 
  stat_compare_means(aes(group=status), label="p.signif", size=6)

svg(filename =paste0(outputPath, "HalEMT_primary_metastatic_unpaired.svg"), 5, 5)
plot(p)
dev.off()


#Read in clinical data for survival analysis:
clinical = read.csv(pathClinical, header=TRUE, sep='\t', fill=TRUE, row.names=1)

clinical$bcr_patient_barcode <- gsub("-", "\\.", clinical$bcr_patient_barcode)
results_scores$sample <- str_sub(rownames(results_scores), 1, 12)
merged <- merge(results_scores, clinical, by.x = "sample", by.y = "bcr_patient_barcode", all.x=TRUE)

cancers <- unique(merged$cancer)

#Prepare variables for multivariate analysis:
merged2 <- merged[!is.na(merged$ajcc_pathologic_tumor_stage),]
merged2 <- merged2[!merged2$ajcc_pathologic_tumor_stage %in% c("Stage X", "[Not Available]", "[Not Applicable]", "[Discrepancy]", "[Unknown]","IS", "I/II NOS"),]
merged2$stage <- merged2$ajcc_pathologic_tumor_stage
merged2$stage <- gsub("Stage ", "", merged2$stage)
merged2[grepl("IV", merged2$stage),"stage"] <- 4
merged2[grepl("III", merged2$stage),"stage"] <- 3
merged2[grepl("II", merged2$stage),"stage"] <- 2
merged2[grepl("I", merged2$stage),"stage"] <- 1
merged2$stage <- as.numeric(merged2$stage)


#3. Calculate global association with survival:
merged2$OS.time <- as.numeric(merged2$OS.time)
merged2$OS <- as.numeric(merged2$OS)
merged2$age_at_initial_pathologic_diagnosis <- as.numeric(merged2$age_at_initial_pathologic_diagnosis)
merged2$EMT <- scale(merged2$EMT)
merged2$HalEMT <- scale(merged2$HalEMT)

res1 <- coxph(Surv(OS.time, OS) ~ age_at_initial_pathologic_diagnosis + stage + EMT + strata(cancer), data =  merged2)
res2 <- coxph(Surv(OS.time, OS) ~ age_at_initial_pathologic_diagnosis + stage + HalEMT + strata(cancer), data =  merged2)
res3 <- coxph(Surv(OS.time, OS) ~ age_at_initial_pathologic_diagnosis + stage + strata(cancer), data =  merged2)
res4 <- coxph(Surv(OS.time, OS) ~ age_at_initial_pathologic_diagnosis + stage + EMT + HalEMT + strata(cancer), data =  merged2)

summary_res2 <- summary(res2)

results_res2 <- data.frame(
  variable = rownames(summary_res2$coefficients),
  coef = summary_res2$coefficients[, "coef"],
  HR = summary_res2$coefficients[, "exp(coef)"],
  lower95 = summary_res2$conf.int[, "lower .95"],
  upper95 = summary_res2$conf.int[, "upper .95"],
  pval = summary_res2$coefficients[, "Pr(>|z|)"]
)

# Save to CSV
write.csv(results_res2, paste0(outputPath, "cox_results_HalEMT_age_stage.csv"), row.names = FALSE)

summary_res4 <- summary(res4)

results_res4 <- data.frame(
  variable = rownames(summary_res4$coefficients),
  coef = summary_res4$coefficients[, "coef"],
  HR = summary_res4$coefficients[, "exp(coef)"],
  lower95 = summary_res4$conf.int[, "lower .95"],
  upper95 = summary_res4$conf.int[, "upper .95"],
  pval = summary_res4$coefficients[, "Pr(>|z|)"]
)

# Save to CSV
write.csv(results_res4, paste0(outputPath, "cox_results_EMT_HalEMT_age_stage.csv"), row.names = FALSE)

#Plot:
results_res2 <- results_res2 %>%
  mutate(
    stars = case_when(
      pval < 0.001 ~ "***",
      pval < 0.01  ~ "**",
      pval < 0.05  ~ "*",
      TRUE ~ ""
    )
  )

results_res2$variable <- c("Age", "Stage", "Hallmark EMT score")

p <- ggplot(results_res2, aes(y = variable, x = HR, xmin = lower95, xmax = upper95)) +
  geom_pointrange() +
  geom_vline(xintercept = 1, linetype = "dashed", color = "red") +
  geom_text(aes(label = stars), 
            hjust = -1, size = 6) +
  xlab("Hazard Ratio (95% CI)") +
  ylab("") +
  theme_minimal() + theme(axis.title= element_text(size=14, color="black"),
                          axis.text = element_text(size=14, color="black")) +
  coord_cartesian(xlim = c(min(results_res2$lower95), max(results_res2$upper95) * 1.2))

svg(filename =paste0(outputPath, "coxph_stratified_age_stage_HalEMT.svg"), height=2.5, 5)
plot(p)
dev.off()

results_res4 <- results_res4 %>%
  mutate(
    stars = case_when(
      pval < 0.001 ~ "***",
      pval < 0.01  ~ "**",
      pval < 0.05  ~ "*",
      TRUE ~ ""
    )
  )

results_res4$variable <- c("Age", "Stage", "Cancer cell EMT score", "Hallmark EMT score")

p <- ggplot(results_res4, aes(y = variable, x = HR, xmin = lower95, xmax = upper95)) +
  geom_pointrange() +
  geom_vline(xintercept = 1, linetype = "dashed", color = "red") +
  geom_text(aes(label = stars), 
            hjust = -1, size = 6) +
  xlab("Hazard Ratio (95% CI)") +
  ylab("") +
  theme_minimal() + theme(axis.title= element_text(size=14, color="black"),
                          axis.text = element_text(size=14, color="black")) +
  coord_cartesian(xlim = c(min(results_res2$lower95), max(results_res2$upper95) * 1.2))

svg(filename =paste0(outputPath, "coxph_stratified_age_stage_EMT_HalEMT.svg"), height=2.5, 5)
plot(p)
dev.off()


summary_res1 <- summary(res1)

results_res1 <- data.frame(
  variable = rownames(summary_res1$coefficients),
  coef = summary_res1$coefficients[, "coef"],
  HR = summary_res1$coefficients[, "exp(coef)"],
  lower95 = summary_res1$conf.int[, "lower .95"],
  upper95 = summary_res1$conf.int[, "upper .95"],
  pval = summary_res1$coefficients[, "Pr(>|z|)"]
)


results_res1 <- results_res1 %>%
  mutate(
    stars = case_when(
      pval < 0.001 ~ "***",
      pval < 0.01  ~ "**",
      pval < 0.05  ~ "*",
      TRUE ~ ""
    )
  )

results_res1$variable <- c("Age", "Stage", "Cancer cell EMT score")

p <- ggplot(results_res1, aes(y = variable, x = HR, xmin = lower95, xmax = upper95)) +
  geom_pointrange() +
  geom_vline(xintercept = 1, linetype = "dashed", color = "red") +
  geom_text(aes(label = stars), 
            hjust = -1, size = 6) +
  xlab("Hazard Ratio (95% CI)") +
  ylab("") +
  theme_minimal() + theme(axis.title= element_text(size=14, color="black"),
                          axis.text = element_text(size=14, color="black")) +
  coord_cartesian(xlim = c(min(results_res2$lower95), max(results_res2$upper95) * 1.2))

svg(filename =paste0(outputPath, "coxph_stratified_age_stage_EMT.svg"), height = 2.5, 5)
plot(p)
dev.off()


#4. Evaluate link with treatment response:
# treatment information should be downloaded using TCGABiolinks packages
treatment_dir <- <directory_with_saved_treatment_information>
list_treatment <- list.files(treatment_dir, pattern="*_treatment.txt")
tr <- gsub("_treatment.txt", "", list_treatment)

# Data frames to store results: 
dadat_chemo <- data.frame(cancer = character(), pvalue = numeric(), cor = numeric())
all_chemo <- data.frame(sample=character(), pharmaceutical_therapy_type=character(), 
                        treatment_best_response=numeric(), EMT=numeric(), cancer = character())

#Results for chemotherapy:
for (can in tr){
  print(can)
  if (file.info(paste0(treatment_dir, can, "_treatment.txt"))$size>1){
    treat <- read.table(paste0(treatment_dir, can, "_treatment.txt"), header=TRUE, sep='\t', fill=TRUE)
    treat <- treat[4:nrow(treat),]
    if (sum(c("pharmaceutical_therapy_type", "treatment_best_response") %in% colnames(treat))==2){
      imu <- treat[treat$pharmaceutical_therapy_type == "Chemotherapy",]
      imu <- unique(imu)
      imu <- imu[!grepl("\\[", imu$treatment_best_response),]
      score <- results_scores[results_scores$cancer == can, "EMT", drop=FALSE]
      score$sample <- results_scores[results_scores$cancer == can, "sample"]
      toRem <- score$sample[duplicated(score$sample)]
      if (length(toRem)>0){
        score <- score[!rownames(score) %in% toRem,]}
      imu$sample <- gsub("-", "\\.", imu[,1])
      comS <- intersect(imu$sample, score$sample)
      imu <- imu[imu$sample %in% comS,c("pharmaceutical_therapy_type", "treatment_best_response","sample")]      
      imu <- merge(imu, score, by="sample")
      print(unique(imu$treatment_best_response))
      
      if (length(unique(imu$treatment_best_response))>1){
        imu[imu$treatment_best_response == "Complete Response","response"] <- 1
        imu[imu$treatment_best_response == "Partial Response","response"] <- 2
        imu[imu$treatment_best_response == "Stable Disease","response"] <- 3
        imu[imu$treatment_best_response == "Clinical Progressive Disease","response"] <- 4
        
        #test <- (kruskal.test(sitth~treatment_best_response,data=imu))
        test <- cor.test(imu$EMT, imu$response, method="spearman")
        #comparisons <- combn(unique(imu$treatment_best_response), 2, simplify = FALSE)
        te <- data.frame(cancer = can, pvalue = test$p.value, cor = test$estimate)
        dadat_chemo <- rbind(dadat_chemo, te)
        imu2 <- imu
        imu2$cancer <- can
        all_chemo <- rbind(all_chemo, imu2)
      }
    }
  }
}


#Results for immunotherapy:
dadat_immuno <- data.frame(cancer = character(), pvalue = numeric(), cor = numeric())
all_imu <- data.frame(sample=character(), pharmaceutical_therapy_type=character(), treatment_best_response=numeric(), EMT=numeric(),
                      cancer = character())

for (can in tr){
  print(can)
  #can <- tr[1]
  if (file.info(paste0(treatment_dir, can, "_treatment.txt"))$size>1){
    treat <- read.table(paste0(treatment_dir, can, "_treatment.txt"), header=TRUE, sep='\t', fill=TRUE)
    treat <- treat[4:nrow(treat),]
    if (sum(c("pharmaceutical_therapy_type", "treatment_best_response") %in% colnames(treat))==2){
      imu <- treat[treat$pharmaceutical_therapy_type == "Immunotherapy",]
      imu <- unique(imu)
      imu <- imu[!grepl("\\[", imu$treatment_best_response),]
      score <- results_scores[results_scores$cancer == can, "EMT", drop=FALSE]
      score$sample <- results_scores[results_scores$cancer == can, "sample"]
      toRem <- score$sample[duplicated(score$sample)]
      if (length(toRem)>0){
        score <- score[!rownames(score) %in% toRem,]}
      imu$sample <- gsub("-", "\\.", imu[,1])
      comS <- intersect(imu$sample, score$sample)
      imu <- imu[imu$sample %in% comS,c("pharmaceutical_therapy_type", "treatment_best_response","sample")]
      imu <- merge(imu, score, by="sample")
      print(unique(imu$treatment_best_response))
      
      if (length(unique(imu$treatment_best_response))>1){
        imu[imu$treatment_best_response == "Complete Response","response"] <- 1
        imu[imu$treatment_best_response == "Partial Response","response"] <- 2
        imu[imu$treatment_best_response == "Stable Disease","response"] <- 3
        imu[imu$treatment_best_response == "Clinical Progressive Disease","response"] <- 4
        
        test <- cor.test(imu$EMT, imu$response, method="spearman")

        te <- data.frame(cancer = can, pvalue = test$p.value, cor = test$estimate)
        dadat_immuno <- rbind(dadat_immuno, te)
        imu2 <- imu
        imu2$cancer <- can
        all_imu <- rbind(all_imu, imu2)
      }
    }
  }
}

#Plot:
dadat_chemo$type <- "Chemotherapy"
dadat_immuno$type <- "Immunotherapy"

dadat <- rbind(dadat_chemo, dadat_immuno)

p <- ggplot(dadat, aes(x = cancer, y = factor(type), fill = cor)) +
  geom_tile(color = "white",
            lwd = 0.5,
            linetype = 1) +
  scale_fill_gradient2("Spearman's Rho", low = "steelblue", high = "red3", mid="white",
                      limits = c(-1, 1),
                      breaks = c(-1, -0.5, 0, 0.5, 1),
                      labels = c(-1, -0.5, 0, 0.5, 1)) +
  theme_minimal() + theme(legend.key.width= unit(1.5, 'cm'), axis.title = element_blank(), legend.title = element_text(size=16), legend.text = element_text(size=16), legend.position="top",
                          axis.text = element_text(size=16, color="black"),
                          axis.text.x = element_text(angle=90),
                          plot.margin = unit(c(1,1,1,1), "lines"))


svg(filename =paste0(outputPath, "EMT_treatment_response_TCGA.svg"), width=7, height=3)
plot(p)
dev.off()


#5. Compare correlation with purity and EMT scores:

purityFile <- "TCGA_mastercalls.abs_tables_JSedit.fixed.txt"
purity <- read.csv(purityFile, sep='\t')
purity$array <- gsub("-", "\\.", purity$array)
purity <- purity[!duplicated(purity$array),]

results_scores$samples <- rownames(results_scores)
merged <- merge(results_scores, purity, by.x = "samples", by.y = "array")

ours <- c()
halEMT <- c()
cancer <- c()

for (can in unique(merged$cancer)){
  tmp <- merged[merged$cancer == can,]
  ours <- c(ours, cor.test(tmp$EMT, tmp$purity, method="spearman")$estimate)
  halEMT <- c(halEMT, cor.test(tmp$HalEMT, tmp$purity, method="spearman")$estimate)
  cancer <- c(cancer, can)
  
}

dd <- data.frame(cancer = cancer, EMT = ours, HalEMT = halEMT)
dd2 <- reshape::melt(dd)
pp <- ggplot(dd2, aes(x = cancer, y =value, color = variable )) + geom_point(size=4) + 
  theme_classic() + theme(axis.text.x = element_text(angle = 90, size=14), axis.text.y = element_text(size=14),
                          axis.title = element_text(size=14, color="black"))

dd2 <- reshape::melt(dd)

#Plot:
pp <- ggplot(dd2, aes(x = variable, y = value, color = variable, group = cancer)) +
  geom_point(size = 1) +
  geom_line(color = "gray") +
  geom_text_repel(data = dd2[dd2$cancer %in% c("THYM", "LAML", "DLBC", dd[dd$EMT < dd$HalEMT,"cancer"]),], aes(label = cancer), size = 3, max.overlaps=23) +  # labels automatically repel each other
  #geom_text_repel(aes(label = cancer), size = 3, max.overlaps=23) +  # labels automatically repel each other
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        axis.title = element_text(size = 14, color = "black"),
        legend.position = "none") +
  labs(x = "Score", y = "Spearman's Rho for correlation with tumor purity") +
  stat_compare_means(
    comparisons = list(c("HalEMT", "EMT")),
    method = "wilcox.test"
  )

svg(filename =paste0(outputPath, "EMT_purity.svg"), 5, 5) #looks so so 
plot(pp)
dev.off()


#> sessionInfo()
#R version 4.1.2 (2021-11-01)
#Platform: x86_64-pc-linux-gnu (64-bit)
#Running under: Ubuntu 22.04.5 LTS
#
#Matrix products: default
#BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
#LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
#
#locale:
# [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
# [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
# [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
#[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
#
#attached base packages:
#[1] stats     graphics  grDevices utils     datasets  methods   base     
#
#other attached packages:
# [1] ggrepel_0.9.3     broom_1.0.4       survminer_0.4.9   survival_3.5-7   
# [5] ggpubr_0.4.0      stringr_1.4.0     dplyr_1.1.4       data.table_1.14.8
# [9] fgsea_1.20.0      msigdbi_1.0.0     msigdb_1.2.0      ggplot2_4.0.0    
#
#loaded via a namespace (and not attached):
# [1] zoo_1.8-10          tidyselect_1.2.1    xfun_0.40          
# [4] purrr_1.0.2         splines_4.1.2       lattice_0.21-8     
# [7] carData_3.0-5       vctrs_0.6.5         generics_0.1.3     
#[10] utf8_1.2.3          survMisc_0.5.6      rlang_1.1.1        
#[13] pillar_1.9.0        glue_1.8.0          withr_3.0.2        
#[16] BiocParallel_1.28.3 RColorBrewer_1.1-3  S7_0.2.0           
#[19] lifecycle_1.0.4     ggsignif_0.6.4      gtable_0.3.6       
#[22] knitr_1.43          parallel_4.1.2      fansi_1.0.4        
#[25] Rcpp_1.0.11         xtable_1.8-4        scales_1.4.0       
#[28] backports_1.4.1     abind_1.4-5         farver_2.1.2       
#[31] km.ci_0.5-6         gridExtra_2.3       fastmatch_1.1-4    
#[34] stringi_1.7.12      rstatix_0.7.2       KMsurv_0.1-5       
#[37] grid_4.1.2          cli_3.6.5           tools_4.1.2        
#[40] magrittr_2.0.3      tibble_3.2.1        dichromat_2.0-0.1  
#[43] tidyr_1.3.1         car_3.1-2           pkgconfig_2.0.3    
#[46] Matrix_1.5-4        R6_2.6.1            compiler_4.1.2 




