# Uses the kinase inhibitor database to study the relationship between drug network properties and the size of the effect
# 
# AUTHOR:	C.Barker
# INPUT: 
#          kinase_inhibition_data,                       Directory containing morphological data paired with drug perturbation
#                                                        availability: https://lincs.hms.harvard.edu/mills-unpubl-2015/
#          csn_semsim.txt,                               regulatory network to be tested, with semantic similarity as edgeweights 
#          dataset_20000_20200713081900.csv              Database with kinase inhibitor specificity 
#                                                        availability: https://lincs.hms.harvard.edu/db/datasets/20000/
#
# OUTPUT:   plots including figure 4, 5 and UMAP for the features tested. 

library(factoextra)
library(tidyverse)
library(stringr)
library(ggplot2)
library(ggridges)
library(reshape2)

# Szymkiewicz-Simpson overlap coefficient:
#   |A n B| / min(|A|, |B|)
# (https://en.wikipedia.org/wiki/Overlap_coefficient)
#
# set1, set2: vectors treated as sets. Duplicated entries are collapsed before
#             the sizes are taken, so a repeated element cannot inflate the
#             denominator (intersect() already de-duplicates the numerator).
# Returns a single number in [0, 1]. When either set is empty the result is 0
# (nothing can be shared) instead of the NaN that 0 / 0 would give.
szymkiewicz_simpson <-
  function(set1, set2) {
    set1 <- unique(set1)
    set2 <- unique(set2)
    smaller <- min(length(set1), length(set2))
    if (smaller == 0) {
      return(0)
    }
    length(intersect(set1, set2)) / smaller
  }
# Jaccard index of two vectors treated as sets: the number of shared elements
# divided by the number of distinct elements across both.
# Two empty inputs give NaN (0 / 0).
jaccard <-
  function(set1, set2) {
    shared <- intersect(set1, set2)
    combined <- union(set1, set2)
    length(shared) / length(combined)
  }
# Min-max normalisation: linearly rescales x so that its minimum maps to 0 and
# its maximum to 1. NA values are not removed, and a constant x yields NaN
# because the range is zero.
normalize <- function(x) {
  lowest <- min(x)
  span <- max(x) - lowest
  (x - lowest) / span
}
# Identity transform: hands its argument back unchanged.
nothing <- function(x) {
  x
}
####PARAMS####

# Cell line to analyse: MCF7, SKBR3, MCF10A or HS578T.
cell.line <- "HS578T"
# Feature family to study, matched against column names:
# "_TXT_" for texture, "_MOR_" for morphology.
property <- "_MOR_"
# For texture: whether to look at the nuclear or the cytoplasmic texture.
# NOTE(review): not referenced anywhere else in the visible script.
nuc.cyt <- "N"

##################MAIN##################

####network####
setwd("~/cell_shapes/kinase_inhibition_data")
# Regulatory network edge table. Alternatives noted by the author:
# ../data/totalNetwork.txt, or ../sem_sim/semsimNet.txt for semantic similarity.
net <- read.delim(
  "~/phenotype_networks/network_results/csn_semsim.txt",
  header = TRUE
)
# Every node that appears at either end of an edge; union() already returns
# each value only once.
nodes <- union(net$from, net$to)

####READ AND MERGE####
# Read every per-experiment table of every replicate for the chosen cell line
# and stack them into `treated`, tagging each row with the replicate number and
# the name of the file it came from.
replicates <- 1:3 # replicate numbers to load

# Reads all experiment files of one replicate; returns a list of data frames.
read_replicate <- function(replicate) {
  rep.path <- paste0("~/barker_et_al_2021/HMS-LINCS-Mills-2015/Replicate-",
                     replicate,
                     "/",
                     cell.line,
                     "/Data/")
  lapply(list.files(rep.path), function(experiment) {
    to.add <- read.delim(paste0(rep.path, experiment),
                         header = TRUE,
                         stringsAsFactors = FALSE)
    data.frame(to.add, Replicate = replicate, File = experiment)
  })
}

experiment.tables <- unlist(lapply(replicates, read_replicate),
                            recursive = FALSE)
if (length(experiment.tables) == 0) {
  # fail here with a clear message instead of later on a missing `treated`
  stop("no experiment files found for cell line ", cell.line, call. = FALSE)
}
# bind once rather than growing the data frame file by file
treated <- do.call(rbind, experiment.tables)
rm(experiment.tables)

# For IDs containing "HMS", strip every character that is not a digit, "." or
# "-", leaving only the LINCS ID.
hms.rows <- grep("HMS", treated$ID)
treated$ID[hms.rows] <- gsub("[^0-9.-]", "", treated$ID[hms.rows])

# Rows with ID "ctrl" are the controls.
is.ctrl <- treated$ID == "ctrl"
ctrls_total <- treated[is.ctrl, ]
# Choose which type of control to keep (there are positive and negative ones):
# here those whose Treatment02 is "Null" or "DMSO".
wanted.ctrl <- ctrls_total$Treatment02 == "Null" | ctrls_total$Treatment02 == "DMSO"
ctrls <- ctrls_total[wanted.ctrl, ]
#ctrls<-ctrls_total[ctrls_total$Treatment02 == "TRAIL" & (ctrls_total$Treatment03 == "1ug/ml"),]#  | ctrls$Treatment02 == "Methanol" | ctrls$Treatment02 == "Ethanol"

# treated <- treated[treated$Treatment03 == "10uM",] #get drug treatments of one concentration
# Drop the controls so that `treated` holds drug treatments only.
treated <- treated[!is.ctrl, ]

####LOAD DRUG TARGET MAPPINGS####
#HMS LINCS small molecule library.  https://lincs.hms.harvard.edu/db/datasets/20000/

ds_2000 <- read.csv("~/phenotype_networks/data/dataset_20000_20200713081900.csv",
                    header = TRUE)
# 1. keep targets with Binding.Class <= 1 (raise the cutoff to admit more
#    targets; 1 is the strictest).
#    NOTE(review): the original comments describe this class both as
#    "Kd < 100 uM" and "Kd < 100 nM" - confirm the unit.
# 2. split the HMS LINCS ID at "-" into its first two parts, ID1 and ID2
# 3. collapse the gene symbols of each (ID1, ID2) pair into one ", "-separated
#    string
ds_2000 <- ds_2000[ds_2000$Binding.Class <= 1, ] %>%
  separate(HMSL.Small.Mol.HMS.LINCS.ID, into = c("ID1", "ID2"), sep = "-") %>%
  group_by(ID1, ID2) %>%
  summarize(target_set = str_c(HUGO.Gene.Symbol, collapse = ", "))

####WORK OUT INFLUENCE SCORE FOR EACH DRUG####

library(igraph)
# Directed graph built from the first two columns of the network table.
g <- graph_from_edgelist(as.matrix(net[, 1:2]), directed = TRUE)
# Per-node centrality used as the node's weight. PageRank is used here; the
# author's noted alternatives are degree, eigenvector and eccentricity.
deg <- page_rank(g)$vector

# Split each drug's ", "-separated target string once and reuse the result.
target.lists <- strsplit(as.character(ds_2000$target_set), ", ")

# Influence of a drug =
#   (summed centrality of those of its targets that are network nodes)
#   * (overlap coefficient between its target set and the node set).
# A drug with no target in the network therefore scores 0.
influences <- vapply(target.lists, function(targets) {
  overlap <- szymkiewicz_simpson(targets, nodes)
  sum.degrees <- sum(deg[nodes[nodes %in% targets]])
  sum.degrees * overlap
}, numeric(1))
# Number of targets listed for each drug.
no.targets <- lengths(target.lists)
#plot(density(influences))

#add this information to our drug target mapping file
ds_2000$ss_distance <- influences
ds_2000$no.targets <- no.targets

####MERGE VARIABLES AND MAPPINGS####

# Attach the drug-target mapping to the treated cells through the LINCS ID.
# NOTE(review): ds_2000 has one row per (ID1, ID2) pair but the join uses ID1
# only; if an ID1 occurs with several ID2 values, every matching treated row is
# duplicated once per match - confirm this is intended.
mapped.data <- merge(ds_2000, treated, by.x = "ID1", by.y = "ID")
# only keep cells treated with drugs that have fewer than 5 targets
mapped.data <- mapped.data[mapped.data$no.targets < 5, ]

# Builds the analysis table for one set of cells: identifying columns, every
# column of `df` whose name matches `property`, and the four stain intensities
# used later for normalisation. drop = FALSE keeps the column name even when
# only a single feature matches.
build_feature_table <- function(df, distance, id, target_sum) {
  data.frame(distance = distance,
             id = as.character(id),
             file = as.character(df$File),
             target_sum = target_sum,
             df[, grep(property, colnames(df)), drop = FALSE],
             DRAQ5_cyt_INT = df$Ch1_INT_Cytoplasm_intensity, # DRAQ5 cytoplasmic intensity
             DRAQ5_nuc_INT = df$Ch1_INT_Nucleus_intensity,   # DRAQ5 nuclear intensity
             TMRE_cyt_INT = df$Ch2_INT_Cytoplasm_intensity,  # TMRE cytoplasmic intensity
             TMRE_nuc_INT = df$Ch2_INT_Nucleus_intensity)    # TMRE nuclear intensity
}

# drug-treated cells carry the drug's influence score and target set
numeric.rel <- build_feature_table(mapped.data,
                                   distance = as.numeric(mapped.data$ss_distance),
                                   id = mapped.data$ID1,
                                   target_sum = as.character(mapped.data$target_set))
# controls have no score; their ID doubles as the target label
numeric.ctrls <- build_feature_table(ctrls,
                                     distance = NA,
                                     id = ctrls$ID,
                                     target_sum = ctrls$ID)
numeric.data <- rbind(numeric.rel, numeric.ctrls)
####REMOVE OUTLIERS USING ZSCORE####
#z.score.cutoff<-3
#numeric.data<-numeric.data[!rowSums(apply(numeric.data[,grep(property, colnames(numeric.data))], 2, scale) > z.score.cutoff | 
#                                      apply(numeric.data[,grep(property, colnames(numeric.data))], 2, scale) < -z.score.cutoff),] 


####NORMALISE RELEVANT VARIABLES BY INTENSITY####

# Each statement below divides every column whose name contains the given
# pattern by the matching stain intensity column. numeric.data only holds
# columns whose names match `property` (plus helper columns), so with
# property == "_MOR_" these texture patterns presumably match no column.

# nuclear texture features (channel 1) by nuclear DRAQ5 intensity
numeric.data[,grep("Ch1_TXT_Nucleus_", colnames(numeric.data))]<-numeric.data[,grep("Ch1_TXT_Nucleus_", colnames(numeric.data))]/numeric.data$DRAQ5_nuc_INT
# cytoplasmic texture features (channel 1) by cytoplasmic DRAQ5 intensity
numeric.data[,grep("Ch1_TXT_Cytoplasm_", colnames(numeric.data))]<-numeric.data[,grep("Ch1_TXT_Cytoplasm_", colnames(numeric.data))]/numeric.data$DRAQ5_cyt_INT
# nuclear texture features (channel 2) by nuclear TMRE intensity
numeric.data[,grep("Ch2_TXT_Nucleus_", colnames(numeric.data))]<-numeric.data[,grep("Ch2_TXT_Nucleus_", colnames(numeric.data))]/numeric.data$TMRE_nuc_INT
# cytoplasmic texture features (channel 2) by cytoplasmic TMRE intensity
numeric.data[,grep("Ch2_TXT_Cytoplasm_", colnames(numeric.data))]<-numeric.data[,grep("Ch2_TXT_Cytoplasm_", colnames(numeric.data))]/numeric.data$TMRE_cyt_INT
# Morphology only: normalise the small-spot counts by the intensity of their
# respective stain and compartment, then average the two channels.
if(property == "_MOR_"){
  # open question from the author: is one of these for nuclear and one for cytoplasmic staining?
  numeric.data$Ch1_MOR_Cytoplasm_Spots_Small_Count<-numeric.data$Ch1_MOR_Cytoplasm_Spots_Small_Count/numeric.data$DRAQ5_cyt_INT
  numeric.data$Ch2_MOR_Cytoplasm_Spots_Small_Count<-numeric.data$Ch2_MOR_Cytoplasm_Spots_Small_Count/numeric.data$TMRE_cyt_INT
  numeric.data$Ch1_MOR_Nucleus_Spots_Small_Count<-numeric.data$Ch1_MOR_Nucleus_Spots_Small_Count/numeric.data$DRAQ5_nuc_INT
  numeric.data$Ch2_MOR_Nucleus_Spots_Small_Count<-numeric.data$Ch2_MOR_Nucleus_Spots_Small_Count/numeric.data$TMRE_nuc_INT
  # Row-wise mean of the channel 1 and channel 2 values, ignoring NA.
  # Because of the "Smll" spelling, these two new columns are not caught by the
  # "_Spots_Small_Count" removal below - do not "fix" the spelling.
  numeric.data$x_MOR_Cytoplasm_Spots_Smll_Count<-rowMeans(data.frame(numeric.data$Ch1_MOR_Cytoplasm_Spots_Small_Count, numeric.data$Ch2_MOR_Cytoplasm_Spots_Small_Count),na.rm = T)
  numeric.data$x_MOR_Nucleus_Spots_Smll_Count<-rowMeans(data.frame(numeric.data$Ch1_MOR_Nucleus_Spots_Small_Count, numeric.data$Ch2_MOR_Nucleus_Spots_Small_Count),na.rm = T)
}

# Drop the per-channel small-spot counts (the averaged "Smll" columns stay) ...
numeric.data[,grep("_Spots_Small_Count", colnames(numeric.data))]<-NULL
# ... and every column whose name contains "_INT", which includes the four
# helper intensity columns used for the normalisation above.
numeric.data[,grep("_INT", colnames(numeric.data))]<-NULL

# (stale header kept from the original: "textural cytoplasmic features by
# cytoplasmic average intensity" - no such step is performed here)

# remove the perim2area columns, as these are derived from other variables
numeric.data[,grep("perim2area", colnames(numeric.data))] <- NULL

# Group the cells so that they can be normalised per batch.
# First one data frame per source file ...
split.nd <- split(numeric.data, f = numeric.data$file)
# ... then reduce each file name to a two-character key: the cell line is
# removed and the single characters on either side of the last "-" are kept.
# According to the author these correspond to the replicate and the batch.
batch.keys <- str_remove(names(split.nd), cell.line)
batch.keys <- sub(".*(.)-(.).*", "\\1\\2", batch.keys)
names(split.nd) <- batch.keys
# Files sharing a key are row-bound into one data frame, so the log fold change
# can be computed on each group separately.
split.nd <- lapply(split(split.nd, names(split.nd)),
                   function(same.key) do.call(rbind, same.key))

# Log10 fold change of every `property` feature relative to the MEDIAN of the
# CONTROL cells (id == "ctrl") of the same group in split.nd. 0.01 is added to
# the control median before dividing.
# The per-group tables are built first and bound once; unname() stops rbind
# from prefixing the row names with the group key.
nz.sep <- do.call(rbind, unname(lapply(split.nd, function(df) {
  data.frame(lincs.id = df$id,
             distance = df$distance,
             apply(df[, grep(property, colnames(df))], 2, function(x) {
               log10(x / (median(x[df$id == "ctrl"], na.rm = TRUE) + 0.01))
             }))
})))

#z.data<-data.frame(lincs.id = numeric.data$id, distance = numeric.data$distance, apply(numeric.data[,grep(property, colnames(numeric.data))], 2, scale))
# Use either nz.sep (normalised per batch) or the commented-out z.data above
# (normalised all together).
z.data <- nz.sep
# long format: one row per cell and feature
melt.z.data <- reshape2::melt(z.data, id.vars = c("lincs.id", "distance"))
# get rid of infinite values (NaN and NA are kept)
finite.rows <- !is.infinite(melt.z.data$value)
melt.z.data <- melt.z.data[finite.rows, ]

# Figure of log-fold changes: one box per drug, ordered by mean squared value,
# filled by the log10 of the drug's influence score, one facet per feature.
fig4 <- ggplot(
  melt.z.data,
  aes(
    x = reorder(lincs.id, (0 - value)^2),
    y = value,
    fill = log10(as.numeric(distance) + 0.00000001)
  )
) +
  geom_boxplot(position = position_dodge(1)) +
  theme_minimal() +
  facet_wrap(. ~ variable, scales = "free_y", nrow = 2) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    panel.grid.major = element_blank(),
    axis.line = element_line()
  ) +
  scale_fill_continuous(type = "viridis") +
  scale_y_continuous(expand = expansion(mult = c(0.1, 0.8)))
fig4
#variable plot 


library(data.table)
drug.ids <- unique(melt.z.data$lincs.id)
feature.names <- unique(melt.z.data$variable)
# Median value of every feature for every drug: one list per drug, holding one
# named entry per feature (despite the name, these are medians, not variances).
variance_drug <- lapply(drug.ids, function(drug.id) {
  per.feature <- lapply(feature.names, function(feature) {
    median(melt.z.data$value[melt.z.data$lincs.id == drug.id & melt.z.data$variable == feature])
  })
  names(per.feature) <- feature.names
  per.feature
})
names(variance_drug) <- drug.ids
# one row per drug, one column per feature
drug_var_df <- data.frame(rbindlist(variance_drug))
drug_var_df$id <- as.character(drug.ids)
# back to long format: one row per drug and feature
molt_drug_var <- reshape2::melt(drug_var_df, id.vars = c("id"))
molt_drug_var$fill <- molt_drug_var$id == "ctrl"
# look up each drug's influence score and target set by its ID
mapping.row <- match(molt_drug_var$id, ds_2000$ID1)
molt_drug_var$distance <- ds_2000$ss_distance[mapping.row]
# TRUE when the score is exactly 0 or the drug has no entry in ds_2000
molt_drug_var$in_out <- molt_drug_var$distance == 0 | is.na(molt_drug_var$distance)
molt_drug_var$kinase_targets <- ds_2000$target_set[mapping.row]
library(ggrepel)
# keep only rows without any NA (this also drops ids that are absent from ds_2000)
molt_drug_var <- molt_drug_var[complete.cases(molt_drug_var), ]
# Flags the elements of x that lie more than 1.5 * IQR below the first quartile
# or above the third quartile. Returns a logical vector as long as x.
is_outlier <- function(x) {
  lower.fence <- quantile(x, 0.25) - 1.5 * IQR(x)
  upper.fence <- quantile(x, 0.75) + 1.5 * IQR(x)
  x < lower.fence | x > upper.fence
}
molt_drug_var$combined_variable <- paste(molt_drug_var$variable,
                                         molt_drug_var$in_out,
                                         sep = "__")
# The row names become a provisional `outlier` label column; within each
# feature, `is_outlier` keeps the value for outliers and NA otherwise.
dat <- molt_drug_var %>%
  tibble::rownames_to_column(var = "outlier") %>%
  group_by(variable) %>%
  mutate(is_outlier = ifelse(is_outlier(value), value, as.numeric(NA)))
# blank the label of every non-outlier ...
not.flagged <- which(is.na(dat$is_outlier))
dat$outlier[not.flagged] <- as.numeric(NA)
# ... and label the remaining rows with the drug's kinase targets
flagged <- !is.na(dat$outlier)
dat$outlier[flagged] <- dat$kinase_targets[flagged]
# Per-feature boxplots of the drug medians, split by in_out, with the outlying
# drugs labelled by their targets.
p <- ggplot(dat, aes(x = value, y = variable, fill = in_out, colour = in_out)) +
  geom_boxplot(position = position_dodge(1)) +
  geom_point(position = position_jitterdodge(jitter.width = 0.2)) +
  geom_label_repel(
    aes(label = outlier, fill = factor(in_out)),
    color = "black",
    size = 6,
    box.padding = .7
  ) +
  theme(legend.position = "bottom")
p



# Two-sample Kolmogorov-Smirnov test per feature: drugs with a non-zero
# influence score (IN) against drugs with a score of exactly 0.
KS_test_var <- molt_drug_var
KS_test_var$IN <- KS_test_var$distance != 0
# drop rows where either the value or the IN flag is missing
usable <- !is.na(KS_test_var$value) & !is.na(KS_test_var$IN)
KS_test_var <- KS_test_var[usable, ]

KS_results <- list()
for (variable in unique(KS_test_var$variable)) {
  # the x / y expressions are kept as written: ks.test stores their text in
  # the result's data.name
  KS_results[[variable]] <- ks.test(
    x = KS_test_var$value[KS_test_var$variable == variable & KS_test_var$IN == T],
    y = KS_test_var$value[KS_test_var$variable == variable & KS_test_var$IN == F]
  )
}

# Effect size of every drug on every feature.
# For each (feature, drug) pair: the absolute mean of the cells' values, paired
# with the drug's influence score. The values are already log-fold changes
# relative to the control median, so no control mean is subtracted, and no
# statistical test is run here.
# Result columns: dis (influence score), variance (absolute mean value - not a
# variance, name kept for the plots below), variable (feature), drg (drug id).
all.drugs <- as.character(unique(melt.z.data$lincs.id))
all.features <- as.character(unique(melt.z.data$variable))
df.pval.results <- do.call(rbind, lapply(all.features, function(variable) {
  of.feature <- melt.z.data[melt.z.data$variable == variable, ]
  variance <- vapply(all.drugs, function(drug) {
    abs(mean(of.feature$value[of.feature$lincs.id == drug], na.rm = TRUE))
  }, numeric(1), USE.NAMES = FALSE)
  # [1] guarantees exactly one score per drug: NA when the drug has no rows for
  # this feature, the first value should a drug ever carry several scores
  dis <- vapply(all.drugs, function(drug) {
    as.numeric(unique(of.feature$distance[of.feature$lincs.id == drug])[1])
  }, numeric(1), USE.NAMES = FALSE)
  data.frame(dis, variance, variable, drg = all.drugs)
}))
# One-way ANOVA of a single feature across drugs, followed by Tukey's multiple
# comparison of means.
# NOTE(review): the feature name is hard-coded and spelled with a lower-case
# "cytoplasm", unlike the "Cytoplasm" columns used elsewhere - confirm that it
# matches a real column.
a1 <- aov(value ~ lincs.id, data = melt.z.data[melt.z.data$variable == "Ch1_MOR_cytoplasm_area", ])
tky.anva <- TukeyHSD(a1)
df.anva <- data.frame(tky.anva$lincs.id)
# The row names are split at "-"; the odd pieces go to `fst`, the even pieces
# to `scd` (the two groups of each comparison, provided no id contains a "-").
pair.pieces <- unlist(strsplit(rownames(df.anva), "-"))
df.anva$fst <- pair.pieces[c(TRUE, FALSE)]
df.anva$scd <- pair.pieces[c(FALSE, TRUE)]
df.anva$lpv <- -log2(df.anva$p.adj + 0.0001)
# Combine the drug influence scores with the Tukey-adjusted p-values of the
# comparisons whose first group is the control.
df.anva <- merge(
  x = data.frame(ds_2000$ID1, ds_2000$ss_distance),
  y = df.anva[df.anva$fst == "ctrl", ],
  by.x = "ds_2000.ID1",
  by.y = "scd"
)

# In the previous figure each dot was a cell; here each dot is a drug, i.e. a
# collection of cells: the absolute mean value of all cells treated with that
# drug, plotted against the drug's influence score with a linear fit, one facet
# per feature.
ggplot(df.pval.results, aes(dis, variance)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ variable, scales = "free", nrow = 2) +
  theme_minimal() +
  theme(
    axis.line = element_line(colour = "black", size = .5, linetype = "solid"),
    strip.text.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# Same scatter for the first feature only, with every drug labelled by its
# target set.
first.feature <- unique(df.pval.results$variable)[1]
subset.results <- df.pval.results[df.pval.results$variable == first.feature, ]
subset.results$targets <- ds_2000$target_set[match(subset.results$drg, ds_2000$ID1)]
# The colour is set as a constant on the layers. Inside aes() the string
# "green" would be mapped as data, giving the default hue and a legend instead
# of green.
ggplot(subset.results, aes(x = dis, y = variance, label = targets)) +
  geom_point(colour = "green") +
  geom_text(colour = "green", hjust = 0, vjust = 0)

# infl: "TRUE"/"FALSE" string, whether the score exceeds the 80th percentile
# of all scores.
influence.cutoff <- as.numeric(quantile(df.pval.results$dis, .8, na.rm = TRUE))
df.pval.results$infl <- as.character(df.pval.results$dis > influence.cutoff)
# in_or_no: whether the score is non-zero (NA when there is no score)
df.pval.results$in_or_no <- df.pval.results$dis != 0


# Effect size per feature, split by in_or_no; rows without a score are left out.
has.membership <- !is.na(df.pval.results$in_or_no)
ggplot(df.pval.results[has.membership, ],
       aes(x = variable, y = variance, fill = in_or_no)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(position = position_dodge(0.8)) +
  theme_minimal() +
  theme(axis.line = element_line(colour = "black", size = .5, linetype = "solid"))
