This script is the 1st of 4 scripts meant to process data in R. This script deals only with the UNPARSED reads aligned to iso-1/dm6.
This Script is broken down into 4 main sections as follows:
Required files:
Subsection A) Load up data
library(edgeR)
## Loading required package: limma
library(RColorBrewer)
library(pheatmap)
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:VennDiagram':
##
## rotate
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(data.table)
# NOTE(review): clearing the global environment from inside a script also
# deletes anything the caller had defined; kept here because the analysis was
# run from a clean workspace.
rm(list = ls())
# set directory
#setwd("~/Desktop/Wunderlichlab/Cis_var_project/Cisvar7/Full_alig/Try9/")
# Load data (all files are read from the current working directory)
# Gene-level count matrix, one row per GENEID
cv7cnd <- read.table("CV_complete_countmatrix.txt", header = TRUE, row.names = "GENEID")
# Keep the first 48 columns only, then drop genes with a missing value in any of them
cv7cnd <- cv7cnd[, 1:48]
cv7cnd <- cv7cnd[complete.cases(cv7cnd), ]
# Sample sheet, one row per Sample
metadata <- read.table("cv7_iso1_meta.txt", header = TRUE, row.names = "Sample")
# Immune gene list (row names are FlyBase IDs) and transcription factor list
immunegenes <- read.table("immunelist_LB_2019_cat.txt", header = TRUE, row.names = "FlybaseID")
tfs <- read.table("tf_Celniker2013_v2.txt", header = TRUE)
# Gene symbol conversion table: drop the RefSeq column and de-duplicate the remaining rows
con_gene <- read.table("gene_symbol_conversions4.txt", header = TRUE)
con_gene$RefSeq <- NULL
con_gene <- unique(con_gene)
#Choose Treatment
Subsection B) User specified condition and subsetting of data.
*Please note that this run is shown for EFAE, but the user can set cond1 to either EFAE or SMAR.
# Choose treatment: cond1 must be "EFAE" or "SMAR".
# The selected treatment is analysed together with the CO2 samples.
cond1 <- "EFAE"
#cond1 <- "SMAR"
if (!cond1 %in% c("EFAE", "SMAR")) {
  # Fail here with a clear message instead of continuing with undefined objects.
  stop("I'm afraid I cant do that Dave. cond1 must be \"EFAE\" or \"SMAR\".",
       call. = FALSE)
}
# Rows of metadata belonging to the chosen treatment or to CO2
parentsii <- metadata$Treatment %in% c(cond1, "CO2")
sampset <- rownames(metadata)[parentsii]
# Genotype, batch and treatment factors for the selected samples (metadata row order)
GT <- factor(c(metadata$genotype)[parentsii])
BC <- factor(c(metadata$Batch)[parentsii])
TR <- factor(c(metadata$Treatment)[parentsii])
# Minimum number of samples with CPM > 1 required to keep a gene
TH <- 12L
# Count columns for the selected samples, in the same order as sampset
# (errors if a sample listed in metadata has no column in the count matrix)
cv7cnc <- cv7cnd[, sampset, drop = FALSE]
Subsection C) Determine differentially expressed genes between Control and Treatment conditions
This step tests only the treatment effect (genotype and batch enter the model as covariates); genotype effects themselves are tested in Sections 1.D - 1.F.
#### DE: Control vs Treated ####
y <- DGEList(counts = cv7cnc, genes = rownames(cv7cnc))
# Flag genes with CPM > 1 in at least TH samples (a loose threshold)
isexpr <- rowSums(cpm(y) > 1) >= TH
dim(y)
## [1] 13922 32
# Drop the low-expressed genes, recompute library sizes, then normalise
y <- y[isexpr, , keep.lib.sizes = FALSE]
y <- calcNormFactors(y)
dim(y)
## [1] 11037 32
# Show treatment, genotype and batch next to each sample
data.frame(Sample = colnames(y), TR, GT, BC)
## Sample TR GT BC
## 1 B18_209_iso1 1 1 1
## 2 B18_216_iso1 1 1 2
## 3 B18_222_iso1 1 1 3
## 4 B19_087_iso1 1 1 8
## 5 B18_198_iso1 1 3 0
## 6 B18_217_iso1 1 3 2
## 7 B18_224_iso1 1 3 3
## 8 B19_085_iso1 1 3 8
## 9 B18_231_iso1 1 2 4
## 10 B18_233_iso1 1 2 4
## 11 B19_086_iso1 1 2 8
## 12 B19_093_iso1 1 2 9
## 13 B18_200_iso1 1 4 0
## 14 B18_230_iso1 1 4 4
## 15 B18_232_iso1 1 4 4
## 16 B19_092_iso1 1 4 9
## 17 B19_075_iso1 2 1 6
## 18 B19_082_iso1 2 1 7
## 19 B19_091_iso1 2 1 8
## 20 B19_095_iso1 2 1 9
## 21 B19_076_iso1 2 3 6
## 22 B19_080_iso1 2 3 7
## 23 B19_088_iso1 2 3 8
## 24 B19_096_iso1 2 3 9
## 25 B19_078_iso1 2 2 6
## 26 B19_081_iso1 2 2 7
## 27 B19_090_iso1 2 2 8
## 28 B19_097_iso1 2 2 9
## 29 B19_077_iso1 2 4 6
## 30 B19_079_iso1 2 4 7
## 31 B19_089_iso1 2 4 8
## 32 B19_094_iso1 2 4 9
# Additive model: treatment + genotype + batch
design <- model.matrix(~ TR + GT + BC)
# Alternative designs that were tried (kept for reference):
#data.frame(Sample=colnames(y), TR,GT)
#design <- model.matrix(~TR+GT)
#data.frame(Sample=colnames(y), TR, BC)
#design <- model.matrix(~TR+BC)
rownames(design) <- colnames(y)
design
## (Intercept) TR2 GT2 GT3 GT4 BC1 BC2 BC3 BC4 BC6 BC7 BC8 BC9
## B18_209_iso1 1 0 0 0 0 1 0 0 0 0 0 0 0
## B18_216_iso1 1 0 0 0 0 0 1 0 0 0 0 0 0
## B18_222_iso1 1 0 0 0 0 0 0 1 0 0 0 0 0
## B19_087_iso1 1 0 0 0 0 0 0 0 0 0 0 1 0
## B18_198_iso1 1 0 0 1 0 0 0 0 0 0 0 0 0
## B18_217_iso1 1 0 0 1 0 0 1 0 0 0 0 0 0
## B18_224_iso1 1 0 0 1 0 0 0 1 0 0 0 0 0
## B19_085_iso1 1 0 0 1 0 0 0 0 0 0 0 1 0
## B18_231_iso1 1 0 1 0 0 0 0 0 1 0 0 0 0
## B18_233_iso1 1 0 1 0 0 0 0 0 1 0 0 0 0
## B19_086_iso1 1 0 1 0 0 0 0 0 0 0 0 1 0
## B19_093_iso1 1 0 1 0 0 0 0 0 0 0 0 0 1
## B18_200_iso1 1 0 0 0 1 0 0 0 0 0 0 0 0
## B18_230_iso1 1 0 0 0 1 0 0 0 1 0 0 0 0
## B18_232_iso1 1 0 0 0 1 0 0 0 1 0 0 0 0
## B19_092_iso1 1 0 0 0 1 0 0 0 0 0 0 0 1
## B19_075_iso1 1 1 0 0 0 0 0 0 0 1 0 0 0
## B19_082_iso1 1 1 0 0 0 0 0 0 0 0 1 0 0
## B19_091_iso1 1 1 0 0 0 0 0 0 0 0 0 1 0
## B19_095_iso1 1 1 0 0 0 0 0 0 0 0 0 0 1
## B19_076_iso1 1 1 0 1 0 0 0 0 0 1 0 0 0
## B19_080_iso1 1 1 0 1 0 0 0 0 0 0 1 0 0
## B19_088_iso1 1 1 0 1 0 0 0 0 0 0 0 1 0
## B19_096_iso1 1 1 0 1 0 0 0 0 0 0 0 0 1
## B19_078_iso1 1 1 1 0 0 0 0 0 0 1 0 0 0
## B19_081_iso1 1 1 1 0 0 0 0 0 0 0 1 0 0
## B19_090_iso1 1 1 1 0 0 0 0 0 0 0 0 1 0
## B19_097_iso1 1 1 1 0 0 0 0 0 0 0 0 0 1
## B19_077_iso1 1 1 0 0 1 0 0 0 0 1 0 0 0
## B19_079_iso1 1 1 0 0 1 0 0 0 0 0 1 0 0
## B19_089_iso1 1 1 0 0 1 0 0 0 0 0 0 1 0
## B19_094_iso1 1 1 0 0 1 0 0 0 0 0 0 0 1
## attr(,"assign")
## [1] 0 1 2 2 2 3 3 3 3 3 3 3 3
## attr(,"contrasts")
## attr(,"contrasts")$TR
## [1] "contr.treatment"
##
## attr(,"contrasts")$GT
## [1] "contr.treatment"
##
## attr(,"contrasts")$BC
## [1] "contr.treatment"
# Estimate dispersions, fit the GLM and test the second design column
# (the treatment term) with a likelihood-ratio test
y <- estimateDisp(y, design, robust = TRUE)
y$common.dispersion
## [1] 0.1592691
fit <- glmFit(y, design)
treat_lrt <- glmLRT(fit, coef = 2)
# Attach the FDR-adjusted p-value and the gene ID to the test table
FDR <- p.adjust(treat_lrt$table$PValue, method = "fdr")
DE_treat <- cbind(treat_lrt$table, FDR, y$genes$genes)
colnames(DE_treat)[6] <- "gene"
# Genes responding to treatment at FDR <= 0.05
treat_de.genes <- DE_treat[DE_treat$FDR <= 0.05, ] ## 1200 genes without GT data and 1500 genes with GT
Subsection D) Determine differentially expressed genes BETWEEN genotypes
Only looking in the CONTROL group (all detected genes are considered).
##### DE CONTROL all genes #####
# CO2 (control) samples of genotype A4 or B6
conf <- metadata$Treatment == "CO2" &
  (metadata$genotype == "A4" | metadata$genotype == "B6")
confset <- rownames(metadata)[conf]
GT <- factor(c(metadata$genotype)[conf])
BC <- factor(c(metadata$Batch)[conf])
cv7cnf <- cv7cnc[names(cv7cnc) %in% confset]
f <- DGEList(counts = cv7cnf, genes = rownames(cv7cnf))
# No low-expression filter is applied in this section: every gene is kept
dim(f)
## [1] 13922 8
# Show genotype and
## extraction batch
data.frame(Sample = colnames(f), GT, BC)
## Sample GT BC
## 1 B18_209_iso1 1 1
## 2 B18_216_iso1 1 2
## 3 B18_222_iso1 1 3
## 4 B19_087_iso1 1 8
## 5 B18_198_iso1 3 0
## 6 B18_217_iso1 3 2
## 7 B18_224_iso1 3 3
## 8 B19_085_iso1 3 8
design <- model.matrix(~ GT + BC)
rownames(design) <- colnames(f)
design
## (Intercept) GT3 BC1 BC2 BC3 BC8
## B18_209_iso1 1 0 1 0 0 0
## B18_216_iso1 1 0 0 1 0 0
## B18_222_iso1 1 0 0 0 1 0
## B19_087_iso1 1 0 0 0 0 1
## B18_198_iso1 1 1 0 0 0 0
## B18_217_iso1 1 1 0 1 0 0
## B18_224_iso1 1 1 0 0 1 0
## B19_085_iso1 1 1 0 0 0 1
## attr(,"assign")
## [1] 0 1 2 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$GT
## [1] "contr.treatment"
##
## attr(,"contrasts")$BC
## [1] "contr.treatment"
# Estimate dispersions, fit the GLM and test the second design column
# (the genotype term) with a likelihood-ratio test
f <- estimateDisp(f, design, robust = TRUE)
f$common.dispersion
## [1] 0.3334487
fit <- glmFit(f, design)
cF_lrt <- glmLRT(fit, coef = 2)
DE_cf <- cF_lrt$table
FDR <- p.adjust(DE_cf$PValue, method = "fdr")
DE_cf <- cbind(DE_cf, FDR, f$genes$genes)
colnames(DE_cf)[6] <- "gene"
# Genes differing between genotypes in the control samples at FDR <= 0.05
cf_de.genes <- DE_cf[DE_cf$FDR <= 0.05, ]
# Which of those genes are listed transcription factors / immune genes
detfs <- cf_de.genes[rownames(cf_de.genes) %in% tfs$FlybaseID, ]
cfimmu <- immunegenes[rownames(immunegenes) %in% rownames(cf_de.genes), ]
# Annotate the DE table: left-join the TF list (row names against its first
# column), then the immune list (first column against its row names)
cfde_tfimm <- merge(cf_de.genes, tfs, by.x = 0, by.y = 1, all.x = TRUE)
cfde_tfimm <- merge(cfde_tfimm, immunegenes, by.x = 1, by.y = 0, all.x = TRUE)
# kfun is TRUE when either annotation column is present for the gene
cfde_tfimm$kfun <- !is.na(cfde_tfimm$gene.y) | !is.na(cfde_tfimm$Symbol)
#write.table(cfde_tfimm, "pts.txt", row.names = F,quote = F, sep = "\t" )
Subsection E) Determine Differentially expressed genes BETWEEN genotypes but only in the CONTROL group.
This section only considers genes that were shown to be differentially expressed in Section 1.C. The file from this section is then used in Script 2.
##### DE control, treatment-responsive genes only #####
# CO2 (control) samples of genotype A4 or B6
conab <- metadata$Treatment == "CO2" &
  (metadata$genotype == "A4" | metadata$genotype == "B6")
consetset <- rownames(metadata)[conab]
GT <- factor(c(metadata$genotype)[conab])
BC <- factor(c(metadata$Batch)[conab])
# Rows: genes DE between control and treated; columns: the control samples above
cv7cnz <- cv7cnc[rownames(cv7cnc) %in% treat_de.genes$gene, ]
cv7cnz <- cv7cnz[names(cv7cnz) %in% consetset]
z <- DGEList(counts = cv7cnz, genes = rownames(cv7cnz))
# No additional low-expression filter is applied in this section
dim(z)
## [1] 1165 8
# Show genotype and
## extraction batch
data.frame(Sample = colnames(z), GT, BC)
## Sample GT BC
## 1 B18_209_iso1 1 1
## 2 B18_216_iso1 1 2
## 3 B18_222_iso1 1 3
## 4 B19_087_iso1 1 8
## 5 B18_198_iso1 3 0
## 6 B18_217_iso1 3 2
## 7 B18_224_iso1 3 3
## 8 B19_085_iso1 3 8
design <- model.matrix(~ GT + BC)
rownames(design) <- colnames(z)
design
## (Intercept) GT3 BC1 BC2 BC3 BC8
## B18_209_iso1 1 0 1 0 0 0
## B18_216_iso1 1 0 0 1 0 0
## B18_222_iso1 1 0 0 0 1 0
## B19_087_iso1 1 0 0 0 0 1
## B18_198_iso1 1 1 0 0 0 0
## B18_217_iso1 1 1 0 1 0 0
## B18_224_iso1 1 1 0 0 1 0
## B19_085_iso1 1 1 0 0 0 1
## attr(,"assign")
## [1] 0 1 2 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$GT
## [1] "contr.treatment"
##
## attr(,"contrasts")$BC
## [1] "contr.treatment"
# Estimate dispersions, fit the GLM and test the second design column
# (the genotype term) with a likelihood-ratio test
z <- estimateDisp(z, design, robust = TRUE)
z$common.dispersion
## [1] 0.1979067
fit <- glmFit(z, design)
cAB_lrt <- glmLRT(fit, coef = 2)
# Attach the FDR-adjusted p-value and the gene ID to the test table
FDR <- p.adjust(cAB_lrt$table$PValue, method = "fdr")
DE_cab <- cbind(cAB_lrt$table, FDR, z$genes$genes)
colnames(DE_cab)[6] <- "gene"
# Treatment-responsive genes that differ between genotypes in control samples
cab_de.genes <- DE_cab[DE_cab$FDR <= 0.05, ]
cabimmu <- immunegenes[rownames(immunegenes) %in% rownames(cab_de.genes), ]
# IDs of the genes found in the all-genes control comparison
cf_genes2toss <- rownames(cf_de.genes)
#write.table(cf_genes2toss, "co2_fullDEgenes.txt", quote = F, row.names = F)
Subsection F) Determine genes differentially expressed BETWEEN genotypes only in TREATED samples.
Only considers differentially expressed genes determined in Section 1.C.
####### DE Treated AB #####
# Only the two supported treatments can be analysed; fail early otherwise so
# that no stale or undefined sample set is used further down.
if (!cond1 %in% c("EFAE", "SMAR")) {
  stop("cond1 must be \"EFAE\" or \"SMAR\", not \"", cond1, "\"", call. = FALSE)
}
# Samples of the chosen treatment with genotype A4 or B6
treated <- metadata$Treatment == cond1 &
  (metadata$genotype == "A4" | metadata$genotype == "B6")
tset <- rownames(metadata)[treated]
GT <- factor(c(metadata$genotype)[treated])
BC <- factor(c(metadata$Batch)[treated])
# Rows: genes DE between control and treated; columns: the treated samples above
cv7cnyz <- cv7cnc[rownames(cv7cnc) %in% treat_de.genes$gene, ]
cv7cnyz <- cv7cnyz[names(cv7cnyz) %in% tset]
yz <- DGEList(counts = cv7cnyz, genes = rownames(cv7cnyz))
# No additional low-expression filter is applied in this section
dim(yz)
## [1] 1165 8
# Show genotype and batch next to each sample
data.frame(Sample = colnames(yz), GT, BC)
## Sample GT BC
## 1 B19_075_iso1 1 6
## 2 B19_082_iso1 1 7
## 3 B19_091_iso1 1 8
## 4 B19_095_iso1 1 9
## 5 B19_076_iso1 3 6
## 6 B19_080_iso1 3 7
## 7 B19_088_iso1 3 8
## 8 B19_096_iso1 3 9
design <- model.matrix(~ GT + BC)
rownames(design) <- colnames(yz)
design
## (Intercept) GT3 BC7 BC8 BC9
## B19_075_iso1 1 0 0 0 0
## B19_082_iso1 1 0 1 0 0
## B19_091_iso1 1 0 0 1 0
## B19_095_iso1 1 0 0 0 1
## B19_076_iso1 1 1 0 0 0
## B19_080_iso1 1 1 1 0 0
## B19_088_iso1 1 1 0 1 0
## B19_096_iso1 1 1 0 0 1
## attr(,"assign")
## [1] 0 1 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$GT
## [1] "contr.treatment"
##
## attr(,"contrasts")$BC
## [1] "contr.treatment"
# Estimate dispersions, fit the GLM and test the second design column
# (the genotype term) with a likelihood-ratio test
yz <- estimateDisp(yz, design, robust = TRUE)
yz$common.dispersion
## [1] 0.1209556
fit <- glmFit(yz, design)
sAB_lrt <- glmLRT(fit, coef = 2)
# Attach the FDR-adjusted p-value and the gene ID to the test table
FDR <- p.adjust(sAB_lrt$table$PValue, method = "fdr")
DE_sab <- cbind(sAB_lrt$table, FDR, yz$genes$genes)
colnames(DE_sab)[6] <- "gene"
# Treatment-responsive genes that differ between genotypes in treated samples
sab_de.genes <- DE_sab[DE_sab$FDR <= 0.05, ]
# Optional exports of the treatment-responsive gene table (disabled)
#write.table(treat_de.genes, "cse_smaronly_DE.txt", col.names = T, quote= F, sep = '\t')
if (cond1 == "EFAE") {
  #write.table(treat_de.genes, "cse_efaeonly_DE_fdr05.txt", col.names = T, quote= F, sep = '\t')
} else if (cond1 == "SMAR") {
  #write.table(treat_de.genes, "cse_smaronly_DE_fdr05.txt", col.names = T, quote= F, sep = '\t')
}
## NULL
Subsection G) Combining Data from sections 1.c-1.f and Organize/sort
###### COMBINE DATA ####
# The three result tables are combined by row position below, so they must
# describe the same genes in the same order. Check instead of assuming.
stopifnot(
  identical(rownames(DE_cab), rownames(treat_de.genes)),
  identical(rownames(DE_sab), rownames(treat_de.genes))
)
treat_de.genes$DE_cvt <- TRUE
DE_cab$DE_c <- DE_cab$FDR <= 0.05 # genotype effect in control
DE_sab$DE_s <- DE_sab$FDR <= 0.05 # genotype effect in treated

# Gene IDs selected by the logical vector `mask`, ordered by increasing `sort_key`
ordered_gene_ids <- function(mask, sort_key) {
  ids <- row.names(treat_de.genes)[mask]
  ids[order(sort_key[mask])]
}
# Direction of the control-vs-treated change
induced <- treat_de.genes$logFC > 0
repressed <- treat_de.genes$logFC < 0

# Group 1: no genotype effect; ordered by logFC between control and treated
genes1 <- !DE_cab$DE_c & !DE_sab$DE_s
genesset1 <- ordered_gene_ids(genes1, treat_de.genes$logFC)
# Group 2: genotype effect in control only; ordered by logFC between parents in control
genes2a <- DE_cab$DE_c & !DE_sab$DE_s & induced
genesset2a <- ordered_gene_ids(genes2a, DE_cab$logFC)
genes2b <- DE_cab$DE_c & !DE_sab$DE_s & repressed
genesset2b <- ordered_gene_ids(genes2b, DE_cab$logFC)
# Group 3: genotype effect in treated only; ordered by logFC between parents in treated
genes3a <- !DE_cab$DE_c & DE_sab$DE_s & induced
genesset3a <- ordered_gene_ids(genes3a, DE_sab$logFC)
genes3b <- !DE_cab$DE_c & DE_sab$DE_s & repressed
genesset3b <- ordered_gene_ids(genes3b, DE_sab$logFC)
# Group 4: genotype effect in both; ordered by logFC between parents in treated
genes4a <- DE_cab$DE_c & DE_sab$DE_s & induced
genesset4a <- ordered_gene_ids(genes4a, DE_sab$logFC)
genes4b <- DE_cab$DE_c & DE_sab$DE_s & repressed
genesset4b <- ordered_gene_ids(genes4b, DE_sab$logFC)

gencat <- c(genesset1, genesset2a, genesset2b, genesset3a, genesset3b, genesset4a, genesset4b) # pheatmap order
#gencat<-c(genesset4b,genesset4a,genesset3b,genesset3a,genesset2b,genesset2a,genesset1)#heatmap order
gene4ase <- c(genesset1, genesset2a, genesset2b)
genenase <- c(genesset4b, genesset4a, genesset3b, genesset3a)

# Two-column table (GeneID, Group) for one group; also valid for an empty group
make_group <- function(ids, label) {
  data.frame(GeneID = ids, Group = rep(label, length(ids)), stringsAsFactors = FALSE)
}
g1 <- make_group(genesset1, "Group1")
g2 <- make_group(c(genesset2a, genesset2b), "Group2")
g3 <- make_group(c(genesset3a, genesset3b), "Group3")
g4 <- make_group(c(genesset4a, genesset4b), "Group4")
g1234 <- rbind(g1, g2, g3, g4)
g234 <- rbind(g2, g3, g4)
# Immune genes falling in each group
g1im <- immunegenes[rownames(immunegenes) %in% g1$GeneID, ]
g2im <- immunegenes[rownames(immunegenes) %in% g2$GeneID, ]
g3im <- immunegenes[rownames(immunegenes) %in% g3$GeneID, ]
g4im <- immunegenes[rownames(immunegenes) %in% g4$GeneID, ]
########### Make a heat map! ########
# CPM table restricted to the grouped genes (rows, in gencat order) and to the
# control + treated parental samples (columns); 0.01 is added before the log
testcpm <- as.data.frame(cpm(y))
parents <- c(consetset, tset)
testcpm <- testcpm[match(gencat, rownames(testcpm)), ]
testcpm <- testcpm[, match(parents, names(testcpm))]
testcpm <- testcpm + .01
# Sample names in metadata for one treatment/genotype combination
samples_for <- function(treatment, genotype) {
  rownames(metadata[metadata$Treatment == treatment & metadata$genotype == genotype, ])
}
a4co2 <- testcpm[, colnames(testcpm) %in% samples_for("CO2", "A4")]
b6co2 <- testcpm[, colnames(testcpm) %in% samples_for("CO2", "B6")]
a4treat <- testcpm[, colnames(testcpm) %in% samples_for(cond1, "A4")]
b6treat <- testcpm[, colnames(testcpm) %in% samples_for(cond1, "B6")]
# Append the per-gene mean to each sample group, then collect the four means
avelist <- lapply(
  list(a4co2, b6co2, a4treat, b6treat),
  function(x) cbind(x, ave = rowMeans(x))
)
avecpm <- vapply(avelist, function(grp) grp$ave, numeric(nrow(testcpm)))
colnames(avecpm) <- c("A4Control", "B6Control", "A4Treated", "B6Treated")
## log transforming, centering and scaling
testcpm2 <- log2(as.matrix(testcpm))
# Centre and scale a numeric matrix with base::scale().
# scale() works column by column: each column has its mean subtracted and is
# then divided by its standard deviation (scale's default).
# Returns a matrix of the same dimensions carrying scale()'s attributes.
# NOTE(review): the matrices passed in this script have genes in rows, so this
# standardises each sample column, not each gene - confirm that is intended.
center_scale <- function(x) {
  scale(x, center = TRUE)
}
# Standardise the per-sample matrix and the log2 group-average matrix
testcpm2 <- center_scale(testcpm2)
avecpm <- center_scale(log2(avecpm))
rownames(avecpm) <- rownames(testcpm)
# Row annotation for the heat map: the group of every gene
rownames(g1234) <- g1234$GeneID
genegroups <- g1234["Group"]
# Gene ID sets per genotype-effect group
genesub2 <- c(genesset2a, genesset2b)
genesub3 <- c(genesset3a, genesset3b)
genesub4 <- c(genesset4a, genesset4b)
genesub <- c(genesub2, genesub3, genesub4)
# Per-sample matrices restricted to Groups 2-4, Group 2, Group 3 and Group 4
testcpm3 <- testcpm2[rownames(testcpm2) %in% genesub, ]
testcpm4 <- testcpm2[rownames(testcpm2) %in% genesub2, ]
testcpm5 <- testcpm2[rownames(testcpm2) %in% genesub3, ]
testcpm6 <- testcpm2[rownames(testcpm2) %in% genesub4, ]
### Gabbies heatmap code
# Interpolate the 11-colour RdBu palette to 10 colours, then reverse the order
col <- colorRampPalette(brewer.pal(11, "RdBu"))(10)
col2 <- rev(col)
# Draw the group-average heat map for the chosen treatment and write the
# ordered gene list and the gene/group table for that treatment
if (cond1 == "EFAE") {
  Figure1A_efae <- pheatmap(
    avecpm,
    scale = "row",
    show_rownames = FALSE,
    legend = TRUE,
    color = col2,
    cluster_rows = FALSE,
    annotation_row = genegroups,
    cluster_cols = FALSE,
    main = "E. faecalis "
  )
  write.table(gencat, "immgenes_efaefdr05.txt", quote = FALSE, row.names = FALSE)
  write.table(g1234, "efae_imgcatfdr05.txt", quote = FALSE, row.names = FALSE)
} else if (cond1 == "SMAR") {
  Figure1A_smar <- pheatmap(
    avecpm,
    scale = "row",
    show_rownames = FALSE,
    legend = TRUE,
    color = col2,
    cluster_rows = FALSE,
    annotation_row = genegroups,
    cluster_cols = FALSE,
    main = "S. marcescens"
  )
  write.table(gencat, "immgenes_smarfdr05.txt", quote = FALSE, row.names = FALSE)
  write.table(g1234, "smar_imgcatfdr05.txt", quote = FALSE, row.names = FALSE)
} else {
  print("I'm afraid I cant do that Dave.")
}
## Balloon plot: number of genes per genotype-effect group for each infection
# Both group tables are read from disk, so the script must already have been
# run once with cond1 = "EFAE" and once with cond1 = "SMAR".
groupnames <- c("Group1", "Group2", "Group3", "Group4")
totsefae <- read.table("efae_imgcatfdr05.txt", header = TRUE)
totssmar <- read.table("smar_imgcatfdr05.txt", header = TRUE)
# Number of rows of `tots` in each of the four groups, in groupnames order
count_groups <- function(tots) {
  vapply(
    groupnames,
    function(g) sum(tots$Group == g, na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
}
tempefae <- count_groups(totsefae)
tempsmar <- count_groups(totssmar)
# One row per group, one numeric column per infection
balloons <- data.frame(
  "E. faecalis" = tempefae,
  "S. marcescens" = tempsmar,
  row.names = c("No Effect", "Control Only", "Treated Only", "Both"),
  check.names = FALSE
)
figure1B <- ggballoonplot(balloons, size.range = c(7, 20), show.label = TRUE) +
  theme_bw() +
  ggtitle("Genotype Effects on Differentially Expressed Genes") +
  ylab("Genotype Effect") + # axis label typo ("Geneotype") corrected
  xlab("Infection") +
  scale_fill_viridis_c(option = "C")
figure1B
###### Immune response E vs S comparison ######
# Replace NA by FALSE in every logical column of `df`; other columns are left
# untouched. (Writing the string "FALSE" into the whole table turns the flags
# into text and produces "invalid factor level" warnings on factor columns.)
na_to_false <- function(df) {
  is_flag <- vapply(df, is.logical, logical(1))
  df[is_flag] <- lapply(df[is_flag], function(flag) {
    flag[is.na(flag)] <- FALSE
    flag
  })
  df
}
# Ordered gene lists written by the heat-map step of the EFAE and SMAR runs
smarfull <- read.table("immgenes_smarfdr05.txt", header = TRUE)
efaefull <- read.table("immgenes_efaefdr05.txt", header = TRUE)
names(smarfull)[1] <- "smar"
smarfull$smarpresent <- TRUE
names(efaefull)[1] <- "efae"
efaefull$efaepresent <- TRUE
# Full outer join on the gene ID; a gene missing from one list gets FALSE there
immune_comp <- merge(efaefull, smarfull, by = 1, all = TRUE)
immune_comp <- na_to_false(immune_comp)
sum(immune_comp$efaepresent & immune_comp$smarpresent) ## shared
## [1] 606
sum(!immune_comp$efaepresent & immune_comp$smarpresent) ## SMAR specific
## [1] 597
sum(immune_comp$efaepresent & !immune_comp$smarpresent) ## efae specific
## [1] 559
shared <- immune_comp[immune_comp$efaepresent & immune_comp$smarpresent, ]
# Drop the shared genes from the per-group immune tables
g1im <- g1im[!rownames(g1im) %in% shared$efae, ]
g2im <- g2im[!rownames(g2im) %in% shared$efae, ]
g3im <- g3im[!rownames(g3im) %in% shared$efae, ]
g4im <- g4im[!rownames(g4im) %in% shared$efae, ]
########### HOW MANY ARE IMMUNE GENES #########
immunegenes$present <- TRUE
# Left join of the immune annotation (first column against its row names)
immune_comp <- merge(immune_comp, immunegenes, by.x = 1, by.y = 0, all.x = TRUE)
immune_comp <- na_to_false(immune_comp)
sum(immune_comp$efaepresent & immune_comp$smarpresent & immune_comp$present) ## shared
## [1] 164
sum(!immune_comp$efaepresent & immune_comp$smarpresent & immune_comp$present) ## SMAR specific
## [1] 98
sum(immune_comp$efaepresent & !immune_comp$smarpresent & immune_comp$present) ## efae specific
## [1] 39
immunecore <- immune_comp[immune_comp$present, ]
# %in% TRUE treats a missing Core value as "not core" whatever the column type
is_core <- immune_comp$Core %in% TRUE
sum(immune_comp$efaepresent & immune_comp$smarpresent & is_core) ## shared
## [1] 101
sum(!immune_comp$efaepresent & immune_comp$smarpresent & is_core) ## SMAR specific
## [1] 44
sum(immune_comp$efaepresent & !immune_comp$smarpresent & is_core) ## efae specific
## [1] 19
immunecorecore <- immunecore[
  immunecore$efaepresent & immunecore$smarpresent & immunecore$Core %in% TRUE,
]
#2) looking for immune or TFs in the differentially expressed stuff
# Ordered gene lists written by the heat-map step (single column named "x")
efaetreat <- read.table("immgenes_efaefdr05.txt", header = TRUE)
smartreat <- read.table("immgenes_smarfdr05.txt", header = TRUE)
if (cond1 == "EFAE") {
  it_labs <- treat_de.genes[rownames(treat_de.genes) %in% efaetreat$x, ]
} else if (cond1 == "SMAR") {
  it_labs <- treat_de.genes[rownames(treat_de.genes) %in% smartreat$x, ]
} else {
  # it_labs would be undefined below, so stop with the message instead of printing it
  stop("I'm afraid i cant do that Dave", call. = FALSE)
}
# Left-join, in turn: gene symbol conversions, TF list, immune list, group table
it_labs <- merge(it_labs, con_gene, by.x = 0, by.y = 2, all.x = TRUE)
it_labs <- merge(it_labs, tfs, by.x = 1, by.y = 1, all.x = TRUE)
it_labs <- merge(it_labs, immunegenes, by.x = 1, by.y = 0, all.x = TRUE)
it_labs <- merge(it_labs, g1234, by = 1, all.x = TRUE)
#3) looking at the total genes between treated and control
# Union of the EFAE and SMAR gene lists (one-column data frame)
totgenes <- merge(efaetreat, smartreat, by = 1, all = TRUE)
#write.table(totgenes, "controlvstreatedDE.txt", quote = F, row.names = F, col.names = F)
# Detected genes that are in neither list. The comparison has to be made
# against the ID column: matching against the data frame itself never matches,
# so nothing was being removed.
test4 <- y$genes
test4 <- test4[!test4$genes %in% totgenes[[1]], ]
#write.table(test4, "detectedgenes.txt", quote = F, row.names = F, col.names = F)
Comparing genes showing genotype effects between treatment types
Test1) Text: “Higher fraction of Smar-responsive genes show genotype effects prior to infection than Efae-responsive genes.” So we are comparing the proportion of Group1+Group3 genes versus the remaining genes between Efae and Smar. We use a two-sample test of equal proportions (prop.test), which accepts either vectors of successes and trials or “a two-dimensional table (or matrix) with 2 columns, giving the counts of successes and failures, respectively.”
##
# Test of equal proportions, given as successes and total trials per infection
success <- c(67, 172)
trails <- c(1165, 1203)
test1a <- prop.test(success, trails)
min(1, 2 * test1a$p.value) # Bonferroni correction (we only do 2 tests), capped at 1
## [1] 1.651636e-11
## Sanity check: the same test from a successes/failures matrix must give the same p-value
efaegenes <- c(67, 1098)
smargenes <- c(172, 1031)
testtable1a <- as.matrix(rbind(efaegenes, smargenes))
# Run the matrix form this time; re-running the vector form would check nothing
test1a_matrix <- prop.test(testtable1a)
stopifnot(isTRUE(all.equal(test1a$p.value, test1a_matrix$p.value)))
min(1, 2 * test1a_matrix$p.value)
## [1] 1.651636e-11
# When x and n are given as vectors you supply total successes and total trials,
# BUT when x is a 2x2 matrix you supply successes and failures - keep that in mind.
# The vector version is a little more intuitive to follow because the totals are visible.
Test2 ) Text: “higher fraction of Efae-responsive genes show genotype effects after infection.” So we are looking at proportions of Group2 / remaining genes of Efae vs Smar.
##
# Second test of equal proportions: successes and total trials per infection
success <- c(433, 91)
trails <- c(1165, 1203)
test1b <- prop.test(x = success, n = trails)
2 * test1b$p.value # Bonferroni correction (we only do 2 tests)
## [1] 9.48361e-67
Comparing average CPM Values between treated and control genes.
# Clean up our environment: remove every object from the workspace so the
# following section starts from the files on disk only.
# NOTE(review): this also deletes objects the caller defined before running the script.
rm(list=ls())
# Row-wise sample variance of a numeric matrix or data frame.
#   x    numeric matrix / data frame, one variance is returned per row
#   ...  forwarded to rowMeans() and rowSums() (e.g. na.rm = TRUE)
# Returns a numeric vector with one variance per row.
# When na.rm = TRUE is supplied by name, each row is divided by its own number
# of non-missing values minus one; dividing by ncol(x) - 1 would underestimate
# the variance of rows that contain NA.
RowVar <- function(x, ...) {
  drop_na <- isTRUE(list(...)[["na.rm"]])
  n_obs <- if (drop_na) rowSums(!is.na(x)) else dim(x)[2]
  rowSums((x - rowMeans(x, ...))^2, ...) / (n_obs - 1)
}
# Reload needed data (the workspace was cleared above)
# Gene-level count matrix; genes with a missing value in any column are dropped
cv7cnd <- read.table("CV_complete_countmatrix.txt", header = TRUE, row.names = "GENEID")
cv7cnd <- cv7cnd[complete.cases(cv7cnd), ]
metadata <- read.table("cv7_iso1_meta.txt", header = TRUE, row.names = "Sample")
# Gene/group tables written by the EFAE and SMAR runs of the first part
totsefae <- read.table("efae_imgcatfdr05.txt", header = TRUE)
totssmar <- read.table("smar_imgcatfdr05.txt", header = TRUE)
con_gene <- read.table("gene_symbol_conversions4.txt", header = TRUE)
con_gene$RefSeq <- NULL
con_gene <- unique(con_gene)
## Keep only the count columns of samples whose Generation is "f0"
tempcounts <- cv7cnd[, names(cv7cnd) %in% rownames(metadata[metadata$Generation == "f0", ])]
y <- DGEList(counts = tempcounts, genes = rownames(tempcounts))
y <- calcNormFactors(y)
# Filter out low expressed genes: keep genes with CPM > 1 in at least 5 samples
isexpr <- rowSums(cpm(y) > 1) >= 5
y <- y[isexpr, , keep.lib.sizes = FALSE] ## to remove low expressed stuff
y <- calcNormFactors(y)
# Now we can work with these values
# NOTE(review): cpm() is given the bare count matrix here, so the normalisation
# factors stored in y are presumably not used for these CPM values; cpm(y)
# would apply them. Left unchanged because it alters every number below - confirm intent.
tempcpm <- as.data.frame(cpm(y$counts))
nrow(tempcpm)
## [1] 9579
# Calculate average CPM per gene for the f0 CO2, EFAE and SMAR samples
tempcpm$CO2 <- rowMeans(tempcpm[, rownames(metadata[metadata$Generation == "f0" & metadata$Treatment == "CO2", ])])
tempcpm$EFAE <- rowMeans(tempcpm[, rownames(metadata[metadata$Generation == "f0" & metadata$Treatment == "EFAE", ])])
tempcpm$SMAR <- rowMeans(tempcpm[, rownames(metadata[metadata$Generation == "f0" & metadata$Treatment == "SMAR", ])])
# Pull out the infection-specific genes (control mean + treated mean per gene)
tempecpm <- tempcpm[rownames(tempcpm) %in% totsefae$GeneID, c("CO2", "EFAE")]
tempscpm <- tempcpm[rownames(tempcpm) %in% totssmar$GeneID, c("CO2", "SMAR")]
colnames(tempecpm)[1] <- "E_CO2"
colnames(tempscpm)[1] <- "S_CO2"
# Long table over all responsive genes; a gene absent from one list is NA there
tempdegcpm <- merge(tempecpm, tempscpm, by = 0, all = TRUE)
tempdegcpm$Row.names <- NULL
cpmdeg <- gather(tempdegcpm, "Treatment", "AverageCPM")
## Different genotype effect groups
# For Group1..Group4 of `tots` (gene IDs in its first column): take those genes
# from `avgcpm`, rename the two columns to g<k>_co2 / g<k><suffix>, reshape to
# long form and stack the four groups in order.
# (The "temp" prefix lets the final cleanup remove this helper as well.)
tempstack_groups <- function(avgcpm, tots, suffix) {
  per_group <- lapply(seq_len(4), function(k) {
    ids <- tots[tots$Group == paste0("Group", k), 1]
    grp <- avgcpm[rownames(avgcpm) %in% ids, ]
    colnames(grp) <- c(paste0("g", k, "_co2"), paste0("g", k, suffix))
    gather(grp, "Treatment", "AverageCPM")
  })
  do.call(rbind, per_group)
}
cpmefae <- tempstack_groups(tempecpm, totsefae, "efae")
cpmsmar <- tempstack_groups(tempscpm, totssmar, "smar")
## Now we plot
# Base ggplot objects: one per long table, Treatment on x and AverageCPM on y
C1deg <- ggplot(cpmdeg, aes(x = Treatment, y = AverageCPM))
C1efae <- ggplot(cpmefae, aes(x = Treatment, y = AverageCPM))
C1smar <- ggplot(cpmsmar, aes(x = Treatment, y = AverageCPM))
# First we look at all the genes differentially expressed in response to infection
# Mean per Treatment, rounded to 2 decimals, printed as text on the plot
c1degmeans <- aggregate(AverageCPM ~ Treatment, cpmdeg, mean)
c1degmeans$AverageCPM <- round(c1degmeans$AverageCPM, 2)
C1deg <- C1deg +
  geom_violin(trim = FALSE) +
  ggtitle("AverageCPM per Gene for Differentially Expressed Genes\n1165-Efae, 1203-Smar") +
  ylim(0, 250) +
  geom_text(data = c1degmeans, aes(label = AverageCPM))
C1deg
## Warning: Removed 2523 rows containing non-finite values (stat_ydensity).
## Warning: Removed 433 rows containing missing values (geom_violin).
# Then we look at the Efae-specific genes, separated further by genotype effect group
c1efaemeans <- aggregate(AverageCPM ~ Treatment, cpmefae, mean)
c1efaemeans$AverageCPM <- round(c1efaemeans$AverageCPM, 2)
# Axis labels keyed by Treatment value, so each label is tied to its category
# instead of depending on the order in which the categories are sorted
efaelabs <- c(
  g1_co2 = "No Genotype Effects\n(Group 1 CO2)",
  g1efae = "No Genotype Effects\n(Group 1 EFAE)",
  g2_co2 = "Genotype Effects in Control\n(Group 2 CO2)",
  g2efae = "Genotype Effects in Control\n(Group 2 EFAE)",
  g3_co2 = "Genotype Effects in Treated\n(Group 3 CO2)",
  g3efae = "Genotype Effects in Treated\n(Group 3 EFAE)",
  g4_co2 = "Genotype Effects in Both\n(Group 4 CO2)",
  g4efae = "Genotype Effects in Both\n(Group 4 EFAE)"
)
C1efae <- C1efae +
  geom_violin(trim = FALSE) +
  ggtitle("AverageCPM per Gene\nEfae Differentially Expressed Genes (1165)") +
  ylim(0, 300) +
  geom_text(data = c1efaemeans, aes(label = AverageCPM)) +
  scale_x_discrete(labels = efaelabs) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
C1efae
## Warning: Removed 208 rows containing non-finite values (stat_ydensity).
## Warning: Removed 904 rows containing missing values (geom_violin).
# Then we look at the Smar-specific genes, separated further by genotype effect group
c1smarmeans <- aggregate(AverageCPM ~ Treatment, cpmsmar, mean)
c1smarmeans$AverageCPM <- round(c1smarmeans$AverageCPM, 2)
# Axis labels keyed by Treatment value, so each label is tied to its category
# instead of depending on the order in which the categories are sorted
smarlabs <- c(
  g1_co2 = "No Genotype Effects\n(Group 1 CO2)",
  g1smar = "No Genotype Effects\n(Group 1 SMAR)",
  g2_co2 = "Genotype Effects in Control\n(Group 2 CO2)",
  g2smar = "Genotype Effects in Control\n(Group 2 SMAR)",
  g3_co2 = "Genotype Effects in Treated\n(Group 3 CO2)",
  g3smar = "Genotype Effects in Treated\n(Group 3 SMAR)",
  g4_co2 = "Genotype Effects in Both\n(Group 4 CO2)",
  g4smar = "Genotype Effects in Both\n(Group 4 SMAR)"
)
C1smar <- C1smar +
  geom_violin(trim = FALSE) +
  ggtitle("AverageCPM per Gene\nSmar Differentially Expressed Genes (1203)") +
  ylim(0, 500) +
  geom_text(data = c1smarmeans, aes(label = AverageCPM)) +
  scale_x_discrete(labels = smarlabs) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
C1smar
## Warning: Removed 176 rows containing non-finite values (stat_ydensity).
## Warning: Removed 898 rows containing missing values (geom_violin).
## Warning: Removed 1 rows containing missing values (geom_text).
# P-value tests
# First we test for significance in the full set of genes (unpaired, two-sided).
# The argument is spelled `alternative`; a misspelled name is silently ignored.
efaeC1deg <- wilcox.test(
  cpmdeg[cpmdeg$Treatment == "E_CO2", 2],
  cpmdeg[cpmdeg$Treatment == "EFAE", 2],
  paired = FALSE, alternative = "two.sided"
)
smarC1deg <- wilcox.test(
  cpmdeg[cpmdeg$Treatment == "S_CO2", 2],
  cpmdeg[cpmdeg$Treatment == "SMAR", 2],
  paired = FALSE, alternative = "two.sided"
)
## Next we test for a difference between treated and control within each genotype effect group
tempgroups <- c("g1", "g2", "g3", "g4")
# Wilcoxon rank-sum p-value, control vs treated, for one group of one long table
# (the "temp" prefix lets the final cleanup remove this helper as well)
tempgroup_pvalue <- function(cpm_long, group, suffix) {
  control <- cpm_long[cpm_long$Treatment == paste0(group, "_co2"), "AverageCPM"]
  infected <- cpm_long[cpm_long$Treatment == paste0(group, suffix), "AverageCPM"]
  wilcox.test(control, infected, paired = FALSE, alternative = "two.sided")$p.value
}
# One p-value per group, in tempgroups order
efaesig <- vapply(
  tempgroups, function(g) tempgroup_pvalue(cpmefae, g, "efae"),
  numeric(1), USE.NAMES = FALSE
)
smarsig <- vapply(
  tempgroups, function(g) tempgroup_pvalue(cpmsmar, g, "smar"),
  numeric(1), USE.NAMES = FALSE
)
# Per-group p-values, rounded to 5 decimals, kept numeric throughout
sigs <- data.frame(
  Gene_Group = tempgroups,
  Efae_sig = round(efaesig, 5),
  Smar_sig = round(smarsig, 5)
)
# Bonferroni correction: 5 tests were performed per infection condition
# (full DEG, Group1, Group2, Group3, Group4), so each p-value is multiplied by 5.
# The result is capped at 1 because a p-value cannot exceed 1.
sigs$efae_corrected <- pmin(1, sigs$Efae_sig * 5)
sigs$smar_corrected <- pmin(1, sigs$Smar_sig * 5)
rm(list = ls(pattern = "temp"))
min(1, efaeC1deg$p.value * 5)
## [1] 0.3376729
min(1, smarC1deg$p.value * 5)
## [1] 0.1571327
sigs
## Gene_Group Efae_sig Smar_sig efae_corrected smar_corrected
## 1 g1 0.01880 0.07403 0.094 0.37015
## 2 g2 0.53497 0.64477 1.000 1.00000
## 3 g3 0.99026 0.21635 1.000 1.00000
## 4 g4 0.89734 0.71077 1.000 1.00000