# This script implements a strategy to control for one or more variables when making comparisons between sets.
# When comparing a feature (a proportion, a count, or any other measured value) of a sample of interest to a control population,
# it is often desirable to control for any known variable affecting the investigated feature.
# Specifically, through a value-matching strategy, a subset of the ctr population is taken whose
# distribution of the variable we control for matches the corresponding distribution in the sample.
# Set paths to files (line 10, 55, 176, 299)
library("ggpubr")
library("ggthemes")
# Load data from latest IES table
# Read the tab-separated IES table (header row, no quoting, strings kept as
# character vectors).
IES_tab <- read.table(file = 'path to IES table',
                      header = TRUE, stringsAsFactors = FALSE,
                      sep = "\t", quote = "")


# Possible factors affecting the Cin_score are genomic location, IES size and expression levels
#***********************************************************************************************************
## Comparison of Cin_scores controlling for IES size and gene expression
#***********************************************************************************************************
# Create a ctr df with PGM IESs and the features of interest.
# Columns are picked by position in IES_tab; the code below relies on the
# resulting layout (columns 4:15 are dropped later, 14:15 are averaged here).
PGM_df <- IES_tab[, c(2, 3, 4, 5, 16, 13, 19, 17, 18, 14, 15, 20, 21, 99, 100, 119, 122)]

# Keep the first occurrence of every non-missing IES_ID located in an exon.
# %in% is used instead of == so that a missing IES_LOCATION gives FALSE rather
# than NA; an NA in a logical row index would add an all-NA row to the table.
keep_ies <- !duplicated(PGM_df$IES_ID) &
  !is.na(PGM_df$IES_ID) &
  PGM_df$IES_LOCATION %in% "EXON"
PGM_df <- PGM_df[keep_ies, ]

# Average TPM values (row-wise mean of columns 14 and 15)
PGM_df$TPM_25_mean_D12 <- rowMeans(PGM_df[, 14:15])
PGM_df$set <- "PGM"

# Untouched copies of the full table; each temperature section starts from one
# of them because PGM_df itself is reshaped and overwritten further down.
PGM_df1 <- PGM_df
PGM_df2 <- PGM_df
PGM_df3 <- PGM_df



#***************************************************************************************************
#********************************* Value-matching normalization ************************************
#***************************************************************************************************
## 32F1 Create Sample df, somatic IESs (IRS > 0.1)
#****************************************************************
# sample df
# Sample: IESs with IRS_32F1 > 0.1 and IES_PLUS_32F1 + IES_MINUS_32F1 >= 20.
# The section starts from PGM_df3 (an untouched copy of the full table) so it
# can be re-run; PGM_df itself is reshaped below and by the other sections.
# which() drops rows where the condition is NA; plain logical indexing would
# turn each of them into an all-NA row that is counted as a sample IES.
is_somatic_32 <- PGM_df3$IRS_32F1 > 0.1 &
  (PGM_df3$IES_PLUS_32F1 + PGM_df3$IES_MINUS_32F1) >= 20
sample_df_32F1 <- PGM_df3[which(is_somatic_32), ]

# reshape dfs: drop columns 4:15, then move the mean TPM column to position 4
sample_df_32F1 <- sample_df_32F1[, -c(4:15)]
sample_df_32F1 <- sample_df_32F1[, c(1:3, 6, 4:5, 7)]
sample_df_32F1$set <- "32F1"
PGM_df <- PGM_df3[, -c(4:15)]
PGM_df <- PGM_df[, c(1:3, 6, 4:5, 7)]

# subtract sample IES_ID from PGM set (the count of remaining IDs is printed)
length(setdiff(PGM_df$IES_ID, sample_df_32F1$IES_ID))
PGM_df <- PGM_df[!(PGM_df$IES_ID %in% sample_df_32F1$IES_ID), ]


# Read normalized ctr sample from file
ctr_df_32 <- read.csv(file = "path to norm_ctr_32_somatic.txt",
                      header = TRUE, stringsAsFactors = FALSE)
# Run the block below to draw the normalized sample
#****************************************************************
# create empty df
#ctr_df_32 <- data.frame(matrix(nrow = 1, ncol =  length(names(PGM_df))))
#colnames(ctr_df_32) <- names(PGM_df)

#for(i in 1:length(sample_df_32F1$IES_ID)) {
#  for(j in 1:length(PGM_df$IES_ID)) {
#    if(PGM_df[, 3][j] <= (sample_df_32F1[, 3][i] +  sample_df_32F1[, 3][i]*0.1) &
#       PGM_df[, 3][j] >= (sample_df_32F1[, 3][i] -  sample_df_32F1[, 3][i]*0.1) &
#       PGM_df[, 4][j] <= (sample_df_32F1[, 4][i] +  sample_df_32F1[, 4][i]*0.1) &
#       PGM_df[, 4][j] >= (sample_df_32F1[, 4][i] -  sample_df_32F1[, 4][i]*0.1) &
#       PGM_df[, 2][j] <= (sample_df_32F1[, 2][i] +  sample_df_32F1[, 2][i]*0.1) &
#       PGM_df[, 2][j] >= (sample_df_32F1[, 2][i] -  sample_df_32F1[, 2][i]*0.1)){
#      ctr_df_32[i, ] <- PGM_df[j, ] 
      
#    }
#  }
#}

#ctr_df_32 <- ctr_df_32[!is.na(ctr_df_32$IES_ID), ]

# Check ID Overlap between normalized ctr and sample
#ctr_df_32$IES_ID%in%sample_df_32F1$IES_ID

# Write norm-ctr to file
#write.csv(ctr_df_32, file = "../cataniastudent/Desktop/norm_ctr_32.txt", row.names = F, quote = F)
#****************************************************************

# size and TPM distribution
## Size
# Overlay of sample (dark red) and control (default colour) densities, each
# followed by the numeric summaries of both sets.
# The density() calls are written inline on purpose: plot() uses the deparsed
# call as the panel title.
plot(density(log2(sample_df_32F1$IES_LEN), bw = 0.05), col = "dark red")
lines(density(log2(PGM_df$IES_LEN), bw = 0.05))
summary(sample_df_32F1$IES_LEN)
summary(PGM_df$IES_LEN)

## TPM
plot(density(log2(PGM_df$TPM_25_mean_D12)))
lines(density(log2(sample_df_32F1$TPM_25_mean_D12)), col = "dark red")
summary(sample_df_32F1$TPM_25_mean_D12)
summary(PGM_df$TPM_25_mean_D12)

## Cin_score
### All IESs
plot(density(PGM_df$CIN_SCORE))
lines(density(sample_df_32F1$CIN_SCORE), col = "dark red")
summary(sample_df_32F1$CIN_SCORE)
summary(PGM_df$CIN_SCORE)
### Short IESs (IES_LEN < 30)
plot(density(PGM_df$CIN_SCORE[PGM_df$IES_LEN < 30]))
lines(density(sample_df_32F1$CIN_SCORE[sample_df_32F1$IES_LEN < 30]), col = "dark red")
summary(sample_df_32F1$CIN_SCORE[sample_df_32F1$IES_LEN < 30])
summary(PGM_df$CIN_SCORE[PGM_df$IES_LEN < 30])

### Long IESs (IES_LEN > 200)
plot(density(PGM_df$CIN_SCORE[PGM_df$IES_LEN > 200]))
lines(density(sample_df_32F1$CIN_SCORE[sample_df_32F1$IES_LEN > 200]), col = "dark red")
summary(sample_df_32F1$CIN_SCORE[sample_df_32F1$IES_LEN > 200])
summary(PGM_df$CIN_SCORE[PGM_df$IES_LEN > 200])

# Same panels after value-matching: the control is now ctr_df_32
## Size
plot(density(log2(sample_df_32F1$IES_LEN), bw = 0.05), col = "dark red")
lines(density(log2(ctr_df_32$IES_LEN), bw = 0.05))
summary(sample_df_32F1$IES_LEN)
summary(ctr_df_32$IES_LEN)

## TPM
plot(density(log2(ctr_df_32$TPM_25_mean_D12)))
lines(density(log2(sample_df_32F1$TPM_25_mean_D12)), col = "dark red")
summary(sample_df_32F1$TPM_25_mean_D12)
summary(ctr_df_32$TPM_25_mean_D12)

## Cin_score
### All IESs
plot(density(ctr_df_32$CIN_SCORE))
lines(density(sample_df_32F1$CIN_SCORE), col = "dark red")
summary(sample_df_32F1$CIN_SCORE)
summary(ctr_df_32$CIN_SCORE)

### Short IESs (IES_LEN < 30)
plot(density(ctr_df_32$CIN_SCORE[ctr_df_32$IES_LEN < 30]))
lines(density(sample_df_32F1$CIN_SCORE[sample_df_32F1$IES_LEN < 30]), col = "dark red")
summary(sample_df_32F1$CIN_SCORE[sample_df_32F1$IES_LEN < 30])
summary(ctr_df_32$CIN_SCORE[ctr_df_32$IES_LEN < 30])

### Long IESs (IES_LEN > 200)
plot(density(ctr_df_32$CIN_SCORE[ctr_df_32$IES_LEN > 200]))
lines(density(sample_df_32F1$CIN_SCORE[sample_df_32F1$IES_LEN > 200]), col = "dark red")
summary(sample_df_32F1$CIN_SCORE[sample_df_32F1$IES_LEN > 200])
summary(ctr_df_32$CIN_SCORE[ctr_df_32$IES_LEN > 200])

# Fraction of epiIESs (flagged "True" for Dcl2/3 or Dcl5 sensitivity) in the sample
nrow(sample_df_32F1[sample_df_32F1$DCL2_3_SENSITIVE_BIOMART == "True" |
                      sample_df_32F1$DCL5_SENSITIVE_BIOMART == "True", ]) / nrow(sample_df_32F1)
# Same fraction in the value-matched control
nrow(ctr_df_32[ctr_df_32$DCL2_3_SENSITIVE_BIOMART == "True" |
                 ctr_df_32$DCL5_SENSITIVE_BIOMART == "True", ]) / nrow(ctr_df_32)

# Column-wise overview of the three tables
summary(sample_df_32F1)
summary(ctr_df_32)
summary(PGM_df)

## 18F1 Create Sample df, somatic IESs (IRS > 0.1)
#****************************************************************
# sample df
# Sample: IESs with IRS_18F1 > 0.1 and IES_PLUS_18F1 + IES_MINUS_18F1 >= 20,
# taken from PGM_df1 (an untouched copy of the full table).
# which() drops rows where the condition is NA; plain logical indexing would
# turn each of them into an all-NA row that is counted as a sample IES.
is_somatic_18 <- PGM_df1$IRS_18F1 > 0.1 &
  (PGM_df1$IES_PLUS_18F1 + PGM_df1$IES_MINUS_18F1) >= 20
sample_df_18F1 <- PGM_df1[which(is_somatic_18), ]

# reshape dfs: drop columns 4:15, then move the mean TPM column to position 4
sample_df_18F1 <- sample_df_18F1[, -c(4:15)]
sample_df_18F1 <- sample_df_18F1[, c(1:3, 6, 4:5, 7)]
sample_df_18F1$set <- "18F1"
PGM_df <- PGM_df1[, -c(4:15)]
PGM_df <- PGM_df[, c(1:3, 6, 4:5, 7)]

# subtract sample IES_ID from PGM set (the count of remaining IDs is printed)
length(setdiff(PGM_df$IES_ID, sample_df_18F1$IES_ID))
PGM_df <- PGM_df[!(PGM_df$IES_ID %in% sample_df_18F1$IES_ID), ]

# Read normalized ctr sample from file
ctr_df_18 <- read.csv(file = "path to norm_ctr_18_somatic.txt",
                      header = TRUE, stringsAsFactors = FALSE)
# Run the block below to draw the normalized sample
#****************************************************************
# create empty df
#ctr_df_18 <- data.frame(matrix(nrow = 1, ncol =  length(names(PGM_df))))
#colnames(ctr_df_18) <- names(PGM_df)

#for(i in 1:length(sample_df_18F1$IES_ID)) {
#  for(j in 1:length(PGM_df$IES_ID)) {
#    if(PGM_df[, 3][j] <= (sample_df_18F1[, 3][i] +  sample_df_18F1[, 3][i]*0.1) &
#       PGM_df[, 3][j] >= (sample_df_18F1[, 3][i] -  sample_df_18F1[, 3][i]*0.1) &
#       PGM_df[, 4][j] <= (sample_df_18F1[, 4][i] +  sample_df_18F1[, 4][i]*0.1) &
#       PGM_df[, 4][j] >= (sample_df_18F1[, 4][i] -  sample_df_18F1[, 4][i]*0.1) &
#       PGM_df[, 2][j] <= (sample_df_18F1[, 2][i] +  sample_df_18F1[, 2][i]*0.1) &
#       PGM_df[, 2][j] >= (sample_df_18F1[, 2][i] -  sample_df_18F1[, 2][i]*0.1)){
#      ctr_df_18[i, ] <- PGM_df[j, ] 
#      
#    }
#  }
#}

#ctr_df_18 <- ctr_df_18[!is.na(ctr_df_18$IES_ID), ]

# Check ID Overlap between normalized ctr and sample
#ctr_df_18$IES_ID%in%sample_df_18F1$IES_ID

# Write norm-ctr to file
#write.csv(ctr_df_18, file = "../cataniastudent/Desktop/norm_ctr_18.txt", row.names = F, quote = F)
#****************************************************************

# size and TPM distribution
## Size
# Overlay of sample (dark green) and control (default colour) densities, each
# followed by the numeric summaries of both sets.
# The density() calls are written inline on purpose: plot() uses the deparsed
# call as the panel title.
plot(density(log2(sample_df_18F1$IES_LEN), bw = 0.05), col = "dark green")
lines(density(log2(PGM_df$IES_LEN), bw = 0.05))
summary(sample_df_18F1$IES_LEN)
summary(PGM_df$IES_LEN)
## TPM
plot(density(log2(PGM_df$TPM_25_mean_D12)))
lines(density(log2(sample_df_18F1$TPM_25_mean_D12)), col = "dark green")
summary(sample_df_18F1$TPM_25_mean_D12)
summary(PGM_df$TPM_25_mean_D12)

## Cin_score
### All IESs
plot(density(PGM_df$CIN_SCORE))
lines(density(sample_df_18F1$CIN_SCORE), col = "dark green")
summary(sample_df_18F1$CIN_SCORE)
summary(PGM_df$CIN_SCORE)
### Short IESs (IES_LEN < 30)
plot(density(PGM_df$CIN_SCORE[PGM_df$IES_LEN < 30]))
lines(density(sample_df_18F1$CIN_SCORE[sample_df_18F1$IES_LEN < 30]), col = "dark green")
summary(sample_df_18F1$CIN_SCORE[sample_df_18F1$IES_LEN < 30])
summary(PGM_df$CIN_SCORE[PGM_df$IES_LEN < 30])

### Long IESs (IES_LEN > 200); the sample curve is drawn first here
plot(density(sample_df_18F1$CIN_SCORE[sample_df_18F1$IES_LEN > 200]), col = "dark green")
lines(density(PGM_df$CIN_SCORE[PGM_df$IES_LEN > 200]))
summary(sample_df_18F1$CIN_SCORE[sample_df_18F1$IES_LEN > 200])
summary(PGM_df$CIN_SCORE[PGM_df$IES_LEN > 200])

# Same panels after value-matching: the control is now ctr_df_18
## Size
plot(density(log2(ctr_df_18$IES_LEN), bw = 0.05))
lines(density(log2(sample_df_18F1$IES_LEN), bw = 0.05), col = "dark green")
summary(sample_df_18F1$IES_LEN)
summary(ctr_df_18$IES_LEN)

## TPM
plot(density(log2(ctr_df_18$TPM_25_mean_D12)))
lines(density(log2(sample_df_18F1$TPM_25_mean_D12)), col = "dark green")
summary(sample_df_18F1$TPM_25_mean_D12)
summary(ctr_df_18$TPM_25_mean_D12)

## Cin_score
### All IESs
plot(density(ctr_df_18$CIN_SCORE))
lines(density(sample_df_18F1$CIN_SCORE), col = "dark green")
summary(sample_df_18F1$CIN_SCORE)
summary(ctr_df_18$CIN_SCORE)

### Short IESs (IES_LEN < 30)
plot(density(ctr_df_18$CIN_SCORE[ctr_df_18$IES_LEN < 30]))
lines(density(sample_df_18F1$CIN_SCORE[sample_df_18F1$IES_LEN < 30]), col = "dark green")
summary(sample_df_18F1$CIN_SCORE[sample_df_18F1$IES_LEN < 30])
summary(ctr_df_18$CIN_SCORE[ctr_df_18$IES_LEN < 30])

### Long IESs (IES_LEN > 200); the sample curve is drawn first here
plot(density(sample_df_18F1$CIN_SCORE[sample_df_18F1$IES_LEN > 200]), col = "dark green")
lines(density(ctr_df_18$CIN_SCORE[ctr_df_18$IES_LEN > 200]))

summary(sample_df_18F1$CIN_SCORE[sample_df_18F1$IES_LEN > 200])
summary(ctr_df_18$CIN_SCORE[ctr_df_18$IES_LEN > 200])

# Fraction of epiIESs (flagged "True" for Dcl2/3 or Dcl5 sensitivity) in the sample
nrow(sample_df_18F1[sample_df_18F1$DCL2_3_SENSITIVE_BIOMART == "True" |
                      sample_df_18F1$DCL5_SENSITIVE_BIOMART == "True", ]) / nrow(sample_df_18F1)
# Same fraction in the value-matched control
nrow(ctr_df_18[ctr_df_18$DCL2_3_SENSITIVE_BIOMART == "True" |
                 ctr_df_18$DCL5_SENSITIVE_BIOMART == "True", ]) / nrow(ctr_df_18)

# Column-wise overview of the three tables
summary(sample_df_18F1)
summary(ctr_df_18)
summary(PGM_df)

## 25F1 Create Sample df, somatic IESs (IRS > 0.1)
#****************************************************************
# sample df
# Sample: IESs with IRS_25F1 > 0.1 and IES_PLUS_25F1 + IES_MINUS_25F1 >= 20,
# taken from PGM_df2 (an untouched copy of the full table).
# which() drops rows where the condition is NA; plain logical indexing would
# turn each of them into an all-NA row that is counted as a sample IES.
is_somatic_25 <- PGM_df2$IRS_25F1 > 0.1 &
  (PGM_df2$IES_PLUS_25F1 + PGM_df2$IES_MINUS_25F1) >= 20
sample_df_25F1 <- PGM_df2[which(is_somatic_25), ]

# reshape dfs: drop columns 4:15, then move the mean TPM column to position 4
sample_df_25F1 <- sample_df_25F1[, -c(4:15)]
sample_df_25F1 <- sample_df_25F1[, c(1:3, 6, 4:5, 7)]
sample_df_25F1$set <- "25F1"
PGM_df <- PGM_df2[, -c(4:15)]
PGM_df <- PGM_df[, c(1:3, 6, 4:5, 7)]

# subtract sample IES_ID from PGM set (the count of remaining IDs is printed)
length(setdiff(PGM_df$IES_ID, sample_df_25F1$IES_ID))
PGM_df <- PGM_df[!(PGM_df$IES_ID %in% sample_df_25F1$IES_ID), ]


# Read normalized ctr sample from file
ctr_df_25 <- read.csv(file = "path to norm_ctr_25_somatic.txt",
                      header = TRUE, stringsAsFactors = FALSE)
# Run the block below to draw the normalized sample
#****************************************************************
# create empty df
#ctr_df_25 <- data.frame(matrix(nrow = 1, ncol =  length(names(PGM_df))))
#colnames(ctr_df_25) <- names(PGM_df)

#for(i in 1:length(sample_df_25F1$IES_ID)) {
#  for(j in 1:length(PGM_df$IES_ID)) {
#    if(PGM_df[, 3][j] <= (sample_df_25F1[, 3][i] +  sample_df_25F1[, 3][i]*0.1) &
#       PGM_df[, 3][j] >= (sample_df_25F1[, 3][i] -  sample_df_25F1[, 3][i]*0.1) &
#       PGM_df[, 4][j] <= (sample_df_25F1[, 4][i] +  sample_df_25F1[, 4][i]*0.1) &
#       PGM_df[, 4][j] >= (sample_df_25F1[, 4][i] -  sample_df_25F1[, 4][i]*0.1) &
#       PGM_df[, 2][j] <= (sample_df_25F1[, 2][i] +  sample_df_25F1[, 2][i]*0.1) &
#       PGM_df[, 2][j] >= (sample_df_25F1[, 2][i] -  sample_df_25F1[, 2][i]*0.1)){
#      ctr_df_25[i, ] <- PGM_df[j, ] 
#      
#    }
#  }
#}

#ctr_df_25 <- ctr_df_25[!is.na(ctr_df_25$IES_ID), ]

# Check ID Overlap between normalized ctr and sample
#ctr_df_25$IES_ID%in%sample_df_25F1$IES_ID

# Write norm-ctr to file
#write.csv(ctr_df_25, file = "../cataniastudent/Desktop/norm_ctr_25.txt", row.names = F, quote = F)
#****************************************************************

# size and TPM distribution
## Size
# Overlay of sample (dark blue) and control (default colour) densities, each
# followed by the numeric summaries of both sets.
# The density() calls are written inline on purpose: plot() uses the deparsed
# call as the panel title.
plot(density(log2(sample_df_25F1$IES_LEN), bw = 0.05), col = "dark blue")
lines(density(log2(PGM_df$IES_LEN), bw = 0.05))
summary(sample_df_25F1$IES_LEN)
summary(PGM_df$IES_LEN)

## TPM
plot(density(log2(PGM_df$TPM_25_mean_D12)))
lines(density(log2(sample_df_25F1$TPM_25_mean_D12)), col = "dark blue")
summary(sample_df_25F1$TPM_25_mean_D12)
summary(PGM_df$TPM_25_mean_D12)

## Cin_score
### All IESs
plot(density(PGM_df$CIN_SCORE))
lines(density(sample_df_25F1$CIN_SCORE), col = "dark blue")
summary(sample_df_25F1$CIN_SCORE)
summary(PGM_df$CIN_SCORE)

### Short IESs (IES_LEN < 30)
plot(density(PGM_df$CIN_SCORE[PGM_df$IES_LEN < 30]))
lines(density(sample_df_25F1$CIN_SCORE[sample_df_25F1$IES_LEN < 30]), col = "dark blue")
summary(sample_df_25F1$CIN_SCORE[sample_df_25F1$IES_LEN < 30])
summary(PGM_df$CIN_SCORE[PGM_df$IES_LEN < 30])

### Long IESs (IES_LEN > 200)
plot(density(PGM_df$CIN_SCORE[PGM_df$IES_LEN > 200]))
lines(density(sample_df_25F1$CIN_SCORE[sample_df_25F1$IES_LEN > 200]), col = "dark blue")
summary(sample_df_25F1$CIN_SCORE[sample_df_25F1$IES_LEN > 200])
summary(PGM_df$CIN_SCORE[PGM_df$IES_LEN > 200])

# Same panels after value-matching: the control is now ctr_df_25
## Size
plot(density(log2(ctr_df_25$IES_LEN), bw = 0.05))
lines(density(log2(sample_df_25F1$IES_LEN), bw = 0.05), col = "dark blue")
summary(sample_df_25F1$IES_LEN)
summary(ctr_df_25$IES_LEN)

## TPM
plot(density(log2(ctr_df_25$TPM_25_mean_D12)))
lines(density(log2(sample_df_25F1$TPM_25_mean_D12)), col = "dark blue")
summary(sample_df_25F1$TPM_25_mean_D12)
summary(ctr_df_25$TPM_25_mean_D12)

## Cin_score
### All IESs
plot(density(ctr_df_25$CIN_SCORE))
lines(density(sample_df_25F1$CIN_SCORE), col = "dark blue")
summary(sample_df_25F1$CIN_SCORE)
summary(ctr_df_25$CIN_SCORE)

### Short IESs (IES_LEN < 30)
plot(density(ctr_df_25$CIN_SCORE[ctr_df_25$IES_LEN < 30]))
lines(density(sample_df_25F1$CIN_SCORE[sample_df_25F1$IES_LEN < 30]), col = "dark blue")
summary(sample_df_25F1$CIN_SCORE[sample_df_25F1$IES_LEN < 30])
summary(ctr_df_25$CIN_SCORE[ctr_df_25$IES_LEN < 30])

### Long IESs (IES_LEN > 100)
# NOTE(review): this panel uses a 100 bp cut-off whereas every other "Long"
# panel in the script uses 200 bp -- confirm whether that is intentional.
plot(density(ctr_df_25$CIN_SCORE[ctr_df_25$IES_LEN > 100]))
lines(density(sample_df_25F1$CIN_SCORE[sample_df_25F1$IES_LEN > 100]), col = "dark blue")
summary(sample_df_25F1$CIN_SCORE[sample_df_25F1$IES_LEN > 100])
summary(ctr_df_25$CIN_SCORE[ctr_df_25$IES_LEN > 100])

# Fraction of epiIESs (flagged "True" for Dcl2/3 or Dcl5 sensitivity) in the sample
nrow(sample_df_25F1[sample_df_25F1$DCL2_3_SENSITIVE_BIOMART == "True" |
                      sample_df_25F1$DCL5_SENSITIVE_BIOMART == "True", ]) / nrow(sample_df_25F1)
# Same fraction in the value-matched control
nrow(ctr_df_25[ctr_df_25$DCL2_3_SENSITIVE_BIOMART == "True" |
                 ctr_df_25$DCL5_SENSITIVE_BIOMART == "True", ]) / nrow(ctr_df_25)

# Column-wise overview of the three tables
summary(sample_df_25F1)
summary(ctr_df_25)
summary(PGM_df)

#***************************************************************************************************
#************************ Proportion of epiIES before and after normalization **********************
#***************************************************************************************************
# Prep experimental data frame (sort tabs and bind)
# ------------ stacked bar chart --------------
library("stringr")

# Classify every row by its Dcl2/3 and Dcl5 sensitivity flags.
# Adds JOIN (the two "True"/"False" flags pasted together) and DCL (a readable
# label); JOIN values matching none of the four patterns are left unchanged.
# Returns the table with the two extra columns.
add_dcl_class <- function(tab) {
  tab$JOIN <- paste0(tab$DCL2_3_SENSITIVE_BIOMART, tab$DCL5_SENSITIVE_BIOMART)
  dcl <- str_replace(tab$JOIN, "FalseFalse", "Dcl2/3- | Dcl5-")
  dcl <- str_replace(dcl, "FalseTrue", "Dcl2/3- | Dcl5+")
  dcl <- str_replace(dcl, "TrueTrue", "Dcl2/3+ | Dcl5+")
  dcl <- str_replace(dcl, "TrueFalse", "Dcl2/3+ | Dcl5-")
  tab$DCL <- dcl
  tab
}

# Draw as many IES_IDs as there are sample IESs from the pool, after removing
# the sample's own IESs from it. Returns the vector of drawn IDs.
draw_random_ids <- function(pool, sample_ids) {
  eligible <- pool$IES_ID[!(pool$IES_ID %in% sample_ids)]
  sample(eligible, length(sample_ids))
}

# Experimental data: the three somatic samples, tagged with their temperature
sample_df_18F1$Temp <- "18°C"
sample_df_25F1$Temp <- "25°C"
sample_df_32F1$Temp <- "32°C"

df <- rbind(sample_df_18F1, sample_df_25F1, sample_df_32F1)
test <- add_dcl_class(df)

# Prep mock dataframe with random samples.
# The pool is rebuilt from PGM_df3 (the untouched full table) with the same
# reshape used for the samples. PGM_df cannot serve as the pool here: at this
# point it holds the control left over from the 25F1 section, so the 18 and
# 32 degree draws could contain their own sample IESs.
smart_PGM <- PGM_df3[, -c(4:15)]
smart_PGM <- smart_PGM[, c(1:3, 6, 4:5, 7)]
smart_PGM <- add_dcl_class(smart_PGM)

# Fixed seed so that the random controls, the bar chart and the proportion
# tests that use them are reproducible between runs.
set.seed(1)

Rando_18 <- draw_random_ids(smart_PGM, sample_df_18F1$IES_ID)
Rando_18_df <- smart_PGM[smart_PGM$IES_ID %in% Rando_18, ]
Rando_18_df$Temp <- "18°C"

Rando_25 <- draw_random_ids(smart_PGM, sample_df_25F1$IES_ID)
Rando_25_df <- smart_PGM[smart_PGM$IES_ID %in% Rando_25, ]
Rando_25_df$Temp <- "25°C"

Rando_32 <- draw_random_ids(smart_PGM, sample_df_32F1$IES_ID)
Rando_32_df <- smart_PGM[smart_PGM$IES_ID %in% Rando_32, ]
Rando_32_df$Temp <- "32°C"

df_rando <- rbind(Rando_18_df, Rando_25_df, Rando_32_df)

# Prep combo df: each row contributes +1 (observed) or -1 (random control) to
# the stacked bar, so the two sets grow in opposite directions from zero
test$SET <- 1
df_rando$SET <- -1

combo_df2 <- rbind(df_rando, test)

## plot a back to back bar chart
# X Axis Breaks and Labels (labels show absolute counts on both sides)
brks <- seq(-350, 350, 50)
lbls <- as.character(c(seq(350, 0, -50), seq(50, 350, 50)))

bar_before <- ggplot(combo_df2, aes(x = Temp, y = SET, fill = DCL)) +   # Fill column
  geom_bar(stat = "identity", width = 0.85) +   # draw the bars
  scale_y_continuous(breaks = brks,   # Breaks
                     labels = lbls) + # Labels
  coord_flip() +  # Flip axes
  labs(title="",
       subtitle="",
       caption="") +
  theme_tufte() +  # Tufte theme from ggthemes
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        plot.title = element_text(hjust = .5),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 10, colour = "dark blue"),
        axis.text.y = element_text(size = 20, colour = "black"),
        axis.title = element_text(size = 15),
        legend.text = element_text(size = 15)) +
  scale_fill_brewer(palette = "Dark2") + # Color palette
  geom_line(aes(0, 0)) + geom_hline(yintercept=0, linetype="dashed", color = "black", size=1)

bar_before <- bar_before + rremove("legend")

# Test proportions
#**********************************************************************************************
## 32F1
# Number of rows flagged "True" for Dcl2/3 or Dcl5 sensitivity (epiIESs)
count_epi <- function(tab) {
  length(tab$IES_ID[tab$DCL2_3_SENSITIVE_BIOMART == "True" |
                      tab$DCL5_SENSITIVE_BIOMART == "True"])
}

# Successes: epiIES counts in each sample and in its random control
epi_32 <- count_epi(sample_df_32F1)
epi_rando_32 <- count_epi(Rando_32_df)
epi_25 <- count_epi(sample_df_25F1)
epi_rando_25 <- count_epi(Rando_25_df)
epi_18 <- count_epi(sample_df_18F1)
epi_rando_18 <- count_epi(Rando_18_df)

# Trials: sample sizes and number of randomly drawn IDs
trial_32 <- nrow(sample_df_32F1)
trial_rando_32 <- length(Rando_32)
trial_25 <- nrow(sample_df_25F1)
trial_rando_25 <- length(Rando_25)
trial_18 <- nrow(sample_df_18F1)
trial_rando_18 <- length(Rando_18)

# One-sided tests: is the epiIES proportion greater in the sample than in the
# random control? (call text kept as-is, it is printed as the data name)
prop.test(x = c(epi_32, epi_rando_32), n = c(trial_32, trial_rando_32), alternative = "greater")
prop.test(x = c(epi_25, epi_rando_25), n = c(trial_25, trial_rando_25), alternative = "greater")
prop.test(x = c(epi_18, epi_rando_18), n = c(trial_18, trial_rando_18), alternative = "greater")

#geom_label(x=2, y=150, label="Observed", col = "black", size = 5) #+
#geom_label(x=2, y=-150, label="Expected", col = "black", size = 5)

# Prep mock dataframe with value-matched samples (normalized)
# Tag each value-matched control with its temperature and stack them
ctr_df_18$Temp <- "18°C"
ctr_df_25$Temp <- "25°C"
ctr_df_32$Temp <- "32°C"
ctr <- rbind(ctr_df_18, ctr_df_25, ctr_df_32)

# JOIN pastes the two "True"/"False" sensitivity flags together; DCL turns it
# into a readable label by applying the four replacements in order
ctr$JOIN <- paste0(ctr$DCL2_3_SENSITIVE_BIOMART, ctr$DCL5_SENSITIVE_BIOMART)
dcl_labels <- c(FalseFalse = "Dcl2/3- | Dcl5-",
                FalseTrue = "Dcl2/3- | Dcl5+",
                TrueTrue = "Dcl2/3+ | Dcl5+",
                TrueFalse = "Dcl2/3+ | Dcl5-")
ctr$DCL <- ctr$JOIN
for (dcl_key in names(dcl_labels)) {
  ctr$DCL <- str_replace(ctr$DCL, dcl_key, dcl_labels[[dcl_key]])
}


# Combined table: +1 per observed IES, -1 per control IES, so the stacked bars
# of the two sets extend in opposite directions from zero
test$SET <- 1
ctr$SET <- -1

combo_df3 <- rbind(ctr, test)

## Back-to-back bar chart
# Axis breaks, labelled with absolute counts on both sides of zero
brks <- seq(-350, 350, 50)
lbls <- as.character(abs(brks))

# Text and axis styling applied on top of the Tufte theme
bar_after_theme <- theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  plot.title = element_text(hjust = .5),
  axis.ticks = element_blank(),
  axis.text.x = element_text(size = 10, colour = "dark blue"),
  axis.text.y = element_text(size = 20, colour = "black"),
  axis.title = element_text(size = 15),
  legend.text = element_text(size = 15)
)

bar_after <- ggplot(combo_df3, aes(x = Temp, y = SET, fill = DCL)) +
  geom_bar(stat = "identity", width = 0.85) +
  scale_y_continuous(breaks = brks, labels = lbls) +
  coord_flip() +
  labs(title = "", subtitle = "", caption = "") +
  theme_tufte() +
  bar_after_theme +
  scale_fill_brewer(palette = "Dark2") +
  geom_line(aes(0, 0)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black", size = 1) +
  rremove("legend")

# Test proportions
#**********************************************************************************************
## 32F1
# Number of rows flagged "True" for Dcl2/3 or Dcl5 sensitivity (epiIESs)
count_epi <- function(tab) {
  length(tab$IES_ID[tab$DCL2_3_SENSITIVE_BIOMART == "True" |
                      tab$DCL5_SENSITIVE_BIOMART == "True"])
}

# Successes: epiIES counts in each sample and in its value-matched control
epi_32 <- count_epi(sample_df_32F1)
epi_ctr_32 <- count_epi(ctr_df_32)
epi_25 <- count_epi(sample_df_25F1)
epi_ctr_25 <- count_epi(ctr_df_25)
epi_18 <- count_epi(sample_df_18F1)
epi_ctr_18 <- count_epi(ctr_df_18)

# Trials: number of IESs in each sample and control
trial_32 <- nrow(sample_df_32F1)
trial_ctr_32 <- nrow(ctr_df_32)
trial_25 <- nrow(sample_df_25F1)
trial_ctr_25 <- nrow(ctr_df_25)
trial_18 <- nrow(sample_df_18F1)
trial_ctr_18 <- nrow(ctr_df_18)

# One-sided tests: is the epiIES proportion greater in the sample than in the
# value-matched control? (call text kept as-is, it is printed as the data name)
prop.test(x = c(epi_32, epi_ctr_32), n = c(trial_32, trial_ctr_32), alternative = "greater")
prop.test(x = c(epi_25, epi_ctr_25), n = c(trial_25, trial_ctr_25), alternative = "greater")
prop.test(x = c(epi_18, epi_ctr_18), n = c(trial_18, trial_ctr_18), alternative = "greater")

#****************************************************************
# Assemble Supplemental_Fig_S4 (distributions before/after normalization shown for 32F1 only)
#****************************************************************
### Density panels for 32F1: IES size, expression and Cin score, each shown
### before (sample vs. full PGM control) and after (sample vs. value-matched
### control) normalization.

# Build one density panel, filled by the `set` column and with the legend
# removed.
#   combo    data frame holding the two sets to compare
#   x        name of the column to plot
#   xlab     x-axis label
#   add      central-tendency line passed to ggdensity ("median" or "mean")
#   before   TRUE for left-hand panels (y-axis title kept with a right margin),
#            FALSE for right-hand panels (y-axis title removed)
#   x_range  optional c(min, max) passed to xlim(); NULL leaves the axis free
# Returns the ggplot object.
density_panel_32 <- function(combo, x, xlab, add, before, x_range = NULL) {
  panel <- ggdensity(combo, x = x,
                     add = add, rug = FALSE, fill = "set",
                     palette = c("dark red", "gray"),
                     xlab = xlab, ggtheme = theme_bw(),
                     font.legend = c(20, "plain", "black"), alpha = 0.6)
  if (!is.null(x_range)) {
    panel <- panel + xlim(x_range[1], x_range[2])
  }
  y_title <- if (before) {
    element_text(margin = margin(t = 0, r = 20, b = 0, l = 0))
  } else {
    element_blank()
  }
  panel +
    font("x.text", size = 10, color = "dark blue") +
    font("y.text", size = 10, color = "dark blue") +
    font("xlab", size = 15, color = "black") +
    font("ylab", size = 15, color = "black") +
    theme(axis.title.y = y_title,
          axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0))) +
    rremove("legend")
}

## Prepare the tables
# Drop the Temp column added for the bar charts so that sample and controls
# share the same columns. Removing it by name also makes this step safe to
# re-run, unlike dropping column 8 by position.
sample_df_32F1$Temp <- NULL
ctr_df_32$Temp <- NULL
ctr_df_32$set <- "Norm-CTR"

# Un-normalized control for 32F1: the full PGM table (PGM_df3) reshaped like
# the sample, minus the sample's own IESs. PGM_df cannot be used here because
# it was last overwritten by the 25F1 section and holds that control instead.
pgm_ctr_32 <- PGM_df3[, -c(4:15)]
pgm_ctr_32 <- pgm_ctr_32[, c(1:3, 6, 4:5, 7)]
pgm_ctr_32 <- pgm_ctr_32[!(pgm_ctr_32$IES_ID %in% sample_df_32F1$IES_ID), ]

## Combine dfs_Before (expression is plotted on a log2 scale)
combo_before_32 <- rbind(sample_df_32F1, pgm_ctr_32)
combo_before_32$TPM_25_mean_D12 <- log2(combo_before_32$TPM_25_mean_D12)

## Combine dfs_After
combo_df_32 <- rbind(sample_df_32F1, ctr_df_32)
combo_df_32$TPM_25_mean_D12 <- log2(combo_df_32$TPM_25_mean_D12)

### IES_size (x-axis restricted to 0-150 bp)
size_before_32 <- density_panel_32(combo_before_32, x = "IES_LEN",
                                   xlab = "IES size [bp]", add = "median",
                                   before = TRUE, x_range = c(0, 150))
size_after_32 <- density_panel_32(combo_df_32, x = "IES_LEN",
                                  xlab = "IES size [bp]", add = "median",
                                  before = FALSE, x_range = c(0, 150))

### TPM
TPM_before_32 <- density_panel_32(combo_before_32, x = "TPM_25_mean_D12",
                                  xlab = "expression [Log2(TPM)]", add = "median",
                                  before = TRUE)
TPM_after_32 <- density_panel_32(combo_df_32, x = "TPM_25_mean_D12",
                                 xlab = "expression [Log2(TPM)]", add = "median",
                                 before = FALSE)

### Cin (mean line instead of median)
Cin_before_32 <- density_panel_32(combo_before_32, x = "CIN_SCORE",
                                  xlab = "signal strength [Cin score]", add = "mean",
                                  before = TRUE)
Cin_after_32 <- density_panel_32(combo_df_32, x = "CIN_SCORE",
                                 xlab = "signal strength [Cin score]", add = "mean",
                                 before = FALSE)

# Assemble the figure: before/after columns, bar charts in the taller last row
ggarrange(size_before_32, size_after_32,
          TPM_before_32, TPM_after_32,
          Cin_before_32, Cin_after_32,
          bar_before, bar_after,
          ncol = 2, nrow = 4, heights = c(1.5, 1.5, 1.5, 2.5))
