# Full-model LRT results: attach FDR-adjusted p-values and the gene IDs held in
# the DGEList `f`, then keep the genes passing FDR <= 0.05.
DE_cf <- cF_lrt$table
FDR <- p.adjust(DE_cf$PValue, method = "fdr")
DE_cf <- cbind(DE_cf, FDR, f$genes$genes)
# Column 6 is renamed to "gene"; presumably it is the gene-ID column appended
# above (true if the LRT table has four columns) -- TODO confirm.
names(DE_cf)[6] <- "gene"
cf_de.genes <- DE_cf[DE_cf$FDR <= 0.05, ]

# Significant genes whose IDs appear in the TF list / in the immune-gene table.
detfs <- cf_de.genes[is.element(rownames(cf_de.genes), tfs$FlybaseID), ]
cfimmu <- immunegenes[
  is.element(rownames(immunegenes), rownames(cf_de.genes)),
]

# Left-join the TF table (keyed on its first column) and the immune-gene table
# (keyed on its row names) onto the significant genes.
cfde_tfimm <- merge(
  cf_de.genes, tfs,
  by.x = "row.names", by.y = 1, all.x = TRUE
)
cfde_tfimm <- merge(
  cfde_tfimm, immunegenes,
  by.x = 1, by.y = "row.names", all.x = TRUE
)
# kfun is TRUE when either join found a match (non-missing gene.y or Symbol).
cfde_tfimm$kfun <- !is.na(cfde_tfimm$gene.y) | !is.na(cfde_tfimm$Symbol)
#write.table(cfde_tfimm, "pts.txt", row.names = F,quote = F, sep = "\t" )
##### DE control TvC DE genes #####
# Genotype (A4 vs B6) effect within the CO2 control samples, restricted to the
# genes listed in treat_de.genes$gene.
cv7cnz <- cv7cnc[rownames(cv7cnc) %in% treat_de.genes$gene, ]
conab <- metadata$Treatment == "CO2" & metadata$genotype %in% c("A4", "B6")
consetset <- rownames(metadata)[conab]
cv7cnz <- cv7cnz[names(cv7cnz) %in% consetset]
# Look the covariates up by sample name so that GT/BC are guaranteed to be in
# the same order as the count columns. Taking them in metadata row order (as
# before) silently misaligns the design whenever the count matrix and the
# metadata list the samples in a different order.
GT <- factor(metadata[names(cv7cnz), "genotype"])
BC <- factor(metadata[names(cv7cnz), "Batch"])
z <- DGEList(counts = cv7cnz, genes = rownames(cv7cnz))
# NOTE: no low-expression filter is applied in this section.
dim(z)
# sample / genotype / extraction batch, printed as a visual check
data.frame(Sample = colnames(z), GT, BC)
design <- model.matrix(~ GT + BC)
rownames(design) <- colnames(z)
design
z <- estimateDisp(z, design, robust = TRUE)
z$common.dispersion
fit <- glmFit(z, design)
# coef = 2 is the second design column, i.e. the genotype term.
cAB_lrt <- glmLRT(fit, coef = 2)
DE_cab <- cAB_lrt$table
FDR <- p.adjust(DE_cab$PValue, method = "fdr")
DE_cab <- cbind(DE_cab, FDR, z$genes$genes)
colnames(DE_cab)[6] <- "gene"
cab_de.genes <- DE_cab[DE_cab$FDR <= 0.05, ]
cabimmu <- immunegenes[rownames(immunegenes) %in% rownames(cab_de.genes), ]
cf_genes2toss <- rownames(cf_de.genes)
#write.table(cf_genes2toss, "co2_fullDEgenes.txt", quote = F, row.names = F)
####### DE Treated AB #####
# Genotype (A4 vs B6) effect within the samples of the infection named by
# cond1, restricted to the genes listed in treat_de.genes$gene.
cv7cnyz <- cv7cnc[rownames(cv7cnc) %in% treat_de.genes$gene, ]
# Fail loudly on an unsupported condition instead of carrying on with a stale
# or undefined tset/GT/BC.
if (!isTRUE(cond1 %in% c("EFAE", "SMAR"))) {
  stop("cond1 must be \"EFAE\" or \"SMAR\"", call. = FALSE)
}
# The EFAE and SMAR branches differed only in the treatment literal.
treated <- metadata$Treatment == cond1 & metadata$genotype %in% c("A4", "B6")
tset <- rownames(metadata)[treated]
cv7cnyz <- cv7cnyz[names(cv7cnyz) %in% tset]
# Covariates are looked up by sample name so they line up with the count
# columns even if metadata and count matrix order the samples differently.
GT <- factor(metadata[names(cv7cnyz), "genotype"])
BC <- factor(metadata[names(cv7cnyz), "Batch"])
yz <- DGEList(counts = cv7cnyz, genes = rownames(cv7cnyz))
# NOTE: no low-expression filter is applied in this section.
dim(yz)
# sample / genotype / extraction batch, printed as a visual check
data.frame(Sample = colnames(yz), GT, BC)
design <- model.matrix(~ GT + BC)
rownames(design) <- colnames(yz)
design
yz <- estimateDisp(yz, design, robust = TRUE)
yz$common.dispersion
fit <- glmFit(yz, design)
# coef = 2 is the second design column, i.e. the genotype term.
sAB_lrt <- glmLRT(fit, coef = 2)
DE_sab <- sAB_lrt$table
FDR <- p.adjust(DE_sab$PValue, method = "fdr")
DE_sab <- cbind(DE_sab, FDR, yz$genes$genes)
colnames(DE_sab)[6] <- "gene"
sab_de.genes <- DE_sab[DE_sab$FDR <= 0.05, ]
# Disabled exports (the conditional that wrapped them had empty bodies):
#write.table(treat_de.genes, "cse_smaronly_DE.txt", col.names = T, quote= F, sep = '\t')
#  cond1 == "EFAE":
#write.table(treat_de.genes, "cse_efaeonly_DE_fdr05.txt", col.names = T, quote= F, sep = '\t')
#  cond1 == "SMAR":
#write.table(treat_de.genes, "cse_smaronly_DE_fdr05.txt", col.names = T, quote= F, sep = '\t')
###### COMBINE DATA ####
# Classify every gene in treat_de.genes by where a genotype (A4 vs B6) effect
# is significant (FDR <= 0.05):
#   Group1      - neither in control nor in treated
#   Group2 (a/b) - control only
#   Group3 (a/b) - treated only
#   Group4 (a/b) - both
# "a" = positive, "b" = negative logFC in treat_de.genes.
treat_de.genes$DE_cvt <- TRUE
DE_cab$DE_c <- DE_cab$FDR <= 0.05
DE_sab$DE_s <- DE_sab$FDR <= 0.05

# The masks below combine columns of three tables, so line the two genotype
# tables up with treat_de.genes by gene ID first. The previous positional
# indexing was only correct if all three tables happened to share row order.
gene_ids <- row.names(treat_de.genes)
stopifnot(
  all(gene_ids %in% rownames(DE_cab)),
  all(gene_ids %in% rownames(DE_sab))
)
cab_aligned <- DE_cab[gene_ids, ]
sab_aligned <- DE_sab[gene_ids, ]

# Gene IDs selected by `mask`, ordered by increasing `sort_by` within the mask.
ordered_gene_set <- function(mask, sort_by) {
  gene_ids[mask][order(sort_by[mask])]
}

ctl_gt <- cab_aligned$DE_c
trt_gt <- sab_aligned$DE_s
fc_pos <- treat_de.genes$logFC > 0
fc_neg <- treat_de.genes$logFC < 0

# Group 1: ordered by logFC between control and treated
genes1 <- !ctl_gt & !trt_gt
genesset1 <- ordered_gene_set(genes1, treat_de.genes$logFC)
# Group 2: ordered by logFC between parents in control
genes2a <- ctl_gt & !trt_gt & fc_pos
genesset2a <- ordered_gene_set(genes2a, cab_aligned$logFC)
genes2b <- ctl_gt & !trt_gt & fc_neg
genesset2b <- ordered_gene_set(genes2b, cab_aligned$logFC)
# Group 3: ordered by logFC between parents in treated
genes3a <- !ctl_gt & trt_gt & fc_pos
genesset3a <- ordered_gene_set(genes3a, sab_aligned$logFC)
genes3b <- !ctl_gt & trt_gt & fc_neg
genesset3b <- ordered_gene_set(genes3b, sab_aligned$logFC)
# Group 4: ordered by logFC between parents in treated
genes4a <- ctl_gt & trt_gt & fc_pos
genesset4a <- ordered_gene_set(genes4a, sab_aligned$logFC)
genes4b <- ctl_gt & trt_gt & fc_neg
genesset4b <- ordered_gene_set(genes4b, sab_aligned$logFC)

gencat <- c(
  genesset1, genesset2a, genesset2b, genesset3a, genesset3b,
  genesset4a, genesset4b
) # pheatmap order
#gencat<-c(genesset4b,genesset4a,genesset3b,genesset3a,genesset2b,genesset2a,genesset1)#heatmap order
gene4ase <- c(genesset1, genesset2a, genesset2b)
genenase <- c(genesset4b, genesset4a, genesset3b, genesset3a)

# Two-column (GeneID, Group) table for one group; also works for an empty set.
group_frame <- function(ids, label) {
  out <- as.data.frame(ids)
  names(out)[1] <- "GeneID"
  out$Group <- rep(label, nrow(out))
  out
}
g1 <- group_frame(genesset1, "Group1")
g2 <- group_frame(c(genesset2a, genesset2b), "Group2")
g3 <- group_frame(c(genesset3a, genesset3b), "Group3")
g4 <- group_frame(c(genesset4a, genesset4b), "Group4")
g1234 <- rbind(g1, g2, g3, g4)
g234 <- rbind(g2, g3, g4)

# Immune-gene table rows belonging to each group
g1im <- immunegenes[rownames(immunegenes) %in% g1$GeneID, ]
g2im <- immunegenes[rownames(immunegenes) %in% g2$GeneID, ]
g3im <- immunegenes[rownames(immunegenes) %in% g3$GeneID, ]
g4im <- immunegenes[rownames(immunegenes) %in% g4$GeneID, ]
########### Make a heat map! ########
# CPM of the ordered gene list (gencat) for the parental samples, control
# samples first and treated samples second.
testcpm <- as.data.frame(cpm(y))
testcpm <- testcpm[match(gencat, rownames(testcpm)), ]
parents <- c(consetset, tset)
testcpm <- testcpm[, match(parents, names(testcpm))]
testcpm <- testcpm + .01 # offset applied before the log2 transform below

# Sample names for one treatment x genotype combination.
samples_for <- function(trt, gt) {
  rownames(metadata[metadata$Treatment == trt & metadata$genotype == gt, ])
}
a4co2 <- testcpm[, colnames(testcpm) %in% samples_for("CO2", "A4")]
b6co2 <- testcpm[, colnames(testcpm) %in% samples_for("CO2", "B6")]
a4treat <- testcpm[, colnames(testcpm) %in% samples_for(cond1, "A4")]
b6treat <- testcpm[, colnames(testcpm) %in% samples_for(cond1, "B6")]

# Per-gene mean CPM within each of the four sample sets.
avelist <- lapply(
  list(a4co2, b6co2, a4treat, b6treat),
  function(x) cbind(x, ave = rowMeans(x))
)
avecpm <- as.data.frame(do.call(cbind, lapply(avelist, function(x) x$ave)))
colnames(avecpm) <- c("A4Control", "B6Control", "A4Treated", "B6Treated")
avecpm <- as.matrix(avecpm)

## log transforming, centering and scaling
# scale() works column-wise, i.e. per sample / per sample set here.
center_scale <- function(x) {
  scale(x, center = TRUE)
}
testcpm2 <- center_scale(log2(as.matrix(testcpm)))
avecpm <- center_scale(log2(avecpm))
rownames(avecpm) <- rownames(testcpm)

# Row annotation for the heat map: one Group label per gene.
rownames(g1234) <- g1234$GeneID
genegroups <- g1234["Group"]

# Per-sample matrices restricted to groups 2-4, 2 only, 3 only and 4 only.
genesub <- c(
  genesset2a, genesset2b, genesset3a, genesset3b, genesset4a, genesset4b
)
testcpm3 <- testcpm2[is.element(rownames(testcpm2), genesub), ]
genesub2 <- c(genesset2a, genesset2b)
testcpm4 <- testcpm2[is.element(rownames(testcpm2), genesub2), ]
genesub3 <- c(genesset3a, genesset3b)
testcpm5 <- testcpm2[is.element(rownames(testcpm2), genesub3), ]
genesub4 <- c(genesset4a, genesset4b)
testcpm6 <- testcpm2[is.element(rownames(testcpm2), genesub4), ]
### Gabbie's heatmap code
# Ten colours interpolated across the 11-class "RdBu" Brewer palette; the
# trailing (10) is the number of colours returned. col2 is the reversed ramp.
col <- colorRampPalette(brewer.pal(11, "RdBu"))(10)
col2 <- rev(col)
# Draw the group-annotated heat map of mean CPM for the current infection and
# write out the ordered gene list plus the gene -> group table.
if (cond1 == "EFAE") {
  Figure1A_efae <- pheatmap(
    avecpm,
    main = "E. faecalis ",
    scale = "row",
    color = col2,
    annotation_row = genegroups,
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    legend = TRUE
  )
  write.table(gencat, "immgenes_efaefdr05.txt", quote = FALSE, row.names = FALSE)
  write.table(g1234, "efae_imgcatfdr05.txt", quote = FALSE, row.names = FALSE)
} else if (cond1 == "SMAR") {
  Figure1A_smar <- pheatmap(
    avecpm,
    main = "S. marcescens",
    scale = "row",
    color = col2,
    annotation_row = genegroups,
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    legend = TRUE
  )
  write.table(gencat, "immgenes_smarfdr05.txt", quote = FALSE, row.names = FALSE)
  write.table(g1234, "smar_imgcatfdr05.txt", quote = FALSE, row.names = FALSE)
} else {
  print("I'm afraid I cant do that Dave.")
}
## Balloon plot: number of DE genes in each genotype-effect group, per infection
groupnames <- c("Group1", "Group2", "Group3", "Group4")
totsefae <- read.table("efae_imgcatfdr05.txt", header = TRUE)
totssmar <- read.table("smar_imgcatfdr05.txt", header = TRUE)

# Number of rows of `tab` falling in each entry of groupnames.
count_per_group <- function(tab) {
  vapply(
    groupnames,
    function(g) as.numeric(nrow(tab[tab$Group == g, ])),
    numeric(1),
    USE.NAMES = FALSE
  )
}
tempefae <- count_per_group(totsefae)
tempsmar <- count_per_group(totssmar)

# Build the table directly as numeric columns (no character round trip).
balloons <- data.frame(
  tempefae, tempsmar,
  row.names = c("No Effect", "Control Only", "Treated Only", "Both")
)
colnames(balloons) <- c("E. faecalis", "S. marcescens")

figure1B <- ggballoonplot(balloons, size.range = c(7, 20), show.label = TRUE) +
  theme_bw() +
  ggtitle("Genotype Effects on Differentially Expressed Genes") +
  ylab("Genotype Effect") + # label spelling corrected (was "Geneotype")
  xlab("Infection") +
  scale_fill_viridis_c(option = "C")
figure1B
###### Immune response E vs S comparison ######
# Compare the DE gene lists written for the two infections. Each file holds a
# single column of gene IDs (header "x").
smarfull <- read.table("immgenes_smarfdr05.txt", header = TRUE)
efaefull <- read.table("immgenes_efaefdr05.txt", header = TRUE)
names(smarfull)[1] <- "smar"
smarfull$smarpresent <- TRUE
names(efaefull)[1] <- "efae"
efaefull$efaepresent <- TRUE
# Full outer join on the gene ID; the key column keeps the name "efae".
immune_comp <- merge(efaefull, smarfull, by = 1, all = TRUE)
# A gene missing from one list has NA in that list's flag. Fill the flags with
# logical FALSE so they stay logical; the previous data-frame-wide
# `<- "FALSE"` turned them into character columns.
immune_comp$efaepresent[is.na(immune_comp$efaepresent)] <- FALSE
immune_comp$smarpresent[is.na(immune_comp$smarpresent)] <- FALSE
sum(immune_comp$efaepresent & immune_comp$smarpresent) ## shared
sum(!immune_comp$efaepresent & immune_comp$smarpresent) ## SMAR specific
sum(immune_comp$efaepresent & !immune_comp$smarpresent) ## efae specific
shared <- immune_comp[immune_comp$efaepresent & immune_comp$smarpresent, ]
# Drop the genes shared by both infections from the per-group immune tables.
g1im <- g1im[!rownames(g1im) %in% shared$efae, ]
g2im <- g2im[!rownames(g2im) %in% shared$efae, ]
g3im <- g3im[!rownames(g3im) %in% shared$efae, ]
g4im <- g4im[!rownames(g4im) %in% shared$efae, ]
########### HOW MANY ARE IMMUNE GENES #########
# Flag every row of immunegenes, then left-join it onto immune_comp using
# immune_comp's first column and immunegenes' row names as keys. Genes absent
# from immunegenes get NA in every immunegenes column.
immunegenes$present<-TRUE
immune_comp<-merge(immune_comp, immunegenes, by.x=1, by.y=0, all.x=T)
# Every NA cell in the whole data frame becomes the string "FALSE", which turns
# the affected columns into character. The `== TRUE` / `== FALSE` tests below
# still work because the logical is coerced to "TRUE"/"FALSE" for comparison.
# NOTE(review): the messages mentioned originally are presumably warnings from
# assigning "FALSE" into factor columns -- confirm against immunegenes' schema.
immune_comp[is.na(immune_comp)]<-"FALSE" ##ignore the error messages child
# Counts restricted to genes present in immunegenes
sum(immune_comp$efaepresent == TRUE & immune_comp$smarpresent == TRUE & immune_comp$present == TRUE) ##shared
sum(immune_comp$efaepresent == FALSE & immune_comp$smarpresent == TRUE & immune_comp$present == TRUE) ##SMAR specific
sum(immune_comp$efaepresent == TRUE & immune_comp$smarpresent == FALSE & immune_comp$present == TRUE) ## efae specific
immunecore<-immune_comp[immune_comp$present==TRUE,]
# Same counts restricted to rows whose Core column equals TRUE.
# NOTE(review): Core comes from immunegenes, whose columns are not visible
# here; `$` also partial-matches column names -- confirm the column exists.
sum(immune_comp$efaepresent == TRUE & immune_comp$smarpresent == TRUE & immune_comp$Core == TRUE) ##shared
sum(immune_comp$efaepresent == FALSE & immune_comp$smarpresent == TRUE & immune_comp$Core == TRUE) ##SMAR specific
sum(immune_comp$efaepresent == TRUE & immune_comp$smarpresent == FALSE & immune_comp$Core == TRUE) ## efae specific
immunecorecore<-immunecore[immunecore$efaepresent == TRUE & immunecore$smarpresent == TRUE & immunecore$Core == TRUE,]
# 2) looking for immune genes or TFs among the differentially expressed genes
efaetreat <- read.table("immgenes_efaefdr05.txt", header = TRUE)
smartreat <- read.table("immgenes_smarfdr05.txt", header = TRUE)
# Gene list for the current infection. An unsupported cond1 now stops here
# rather than printing a message and failing on the first merge below.
it_genes <- switch(cond1,
  EFAE = efaetreat$x,
  SMAR = smartreat$x,
  stop("I'm afraid i cant do that Dave", call. = FALSE)
)
it_labs <- treat_de.genes[rownames(treat_de.genes) %in% it_genes, ]
# Annotate with symbol conversions, TF status, immune-gene info and the
# genotype-effect group.
it_labs <- merge(it_labs, con_gene, by.x = 0, by.y = 2, all.x = TRUE)
it_labs <- merge(it_labs, tfs, by.x = 1, by.y = 1, all.x = TRUE)
it_labs <- merge(it_labs, immunegenes, by.x = 1, by.y = 0, all.x = TRUE)
it_labs <- merge(it_labs, g1234, by = 1, all.x = TRUE)
# 3) looking at the total genes between treated and control
# Union of the two DE gene lists (one column of gene IDs).
totgenes <- merge(efaetreat, smartreat, by = 1, all = TRUE)
#write.table(totgenes, "controlvstreatedDE.txt", quote = F, row.names = F, col.names = F)
# Genes in y that are in neither DE list. Membership must be tested against
# the gene column: `%in% totgenes` compared each ID with the data frame as a
# list, which does not match individual IDs, so nothing was excluded.
test4 <- y$genes
test4 <- test4[!test4$genes %in% totgenes[[1]], ]
#write.table(test4, "detectedgenes.txt", quote = F, row.names = F, col.names = F)
##
# Two-sample tests of equal proportions between the two infections.
# NOTE(review): the counts are hard-coded, presumably from an earlier run of
# the analysis above -- confirm they are still current.
success <- c(67, 172)
trails <- c(1165, 1203)
test1a <- prop.test(success, trails)
2 * test1a$p.value # Bonferroni correction (we only do 2 tests)
## Sanity check: the 2x2 table form must give exactly the same p-value.
## With x and n as vectors you pass successes and TOTAL trials; with a
## two-column matrix you pass successes and FAILURES.
efaegenes <- c(67, 1098)
smargenes <- c(172, 1031)
testtable1a <- as.matrix(rbind(efaegenes, smargenes))
# Run the test on the table itself; previously the vector call was repeated,
# so the table was built but never checked.
test1a <- prop.test(testtable1a)
2 * test1a$p.value
# The vector form is a little easier to follow because the totals are visible.
##
success <- c(433, 91)
trails <- c(1165, 1203)
test1b <- prop.test(success, trails)
2 * test1b$p.value # Bonferroni correction (we only do 2 tests)
#clean up our environment
# NOTE: this removes every object from the global environment, so nothing
# defined above this line is available to the code that follows.
rm(list=ls())
# Row-wise sample variance of a numeric matrix (or all-numeric data frame).
#
# x    numeric matrix / data frame
# ...  forwarded to rowMeans() and rowSums(), e.g. na.rm = TRUE
#
# Returns one variance per row. The denominator is the number of non-missing
# values in the row minus one, so na.rm = TRUE gives the same result as var()
# on the non-missing values (it used to divide by ncol(x) - 1 regardless).
RowVar <- function(x, ...) {
  sq_dev <- (x - rowMeans(x, ...))^2
  n_obs <- rowSums(!is.na(x), ...)
  rowSums(sq_dev, ...) / (n_obs - 1)
}
# Reload the needed data. Each input is read once; the repeated reads and the
# superseded filter on the undefined `fullcounts` (an error in a clean
# session) are gone.
cv7cnd <- read.table("CV_complete_countmatrix.txt", header = TRUE, row.names = "GENEID") # Gene level data
cv7cnd <- cv7cnd[complete.cases(cv7cnd), ]
metadata <- read.table("cv7_iso1_meta.txt", header = TRUE, row.names = "Sample")
totsefae <- read.table("efae_imgcatfdr05.txt", header = TRUE)
totssmar <- read.table("smar_imgcatfdr05.txt", header = TRUE)
# Gene symbol conversions, de-duplicated after dropping the RefSeq column.
con_gene <- read.table("gene_symbol_conversions4.txt", header = TRUE)
con_gene$RefSeq <- NULL
con_gene <- unique(con_gene)
## filter out the data we need: count columns of the f0-generation samples
tempcounts <- cv7cnd[, names(cv7cnd) %in% rownames(metadata[metadata$Generation == "f0", ])]
y <- DGEList(counts = tempcounts, genes = rownames(tempcounts))
y <- calcNormFactors(y)
# filter out low expressed genes: keep genes with CPM > 1 in at least 5 samples
isexpr <- rowSums(cpm(y) > 1) >= 5
y <- y[isexpr, , keep.lib.sizes = FALSE] ## to remove low expressed stuff
y <- calcNormFactors(y)
# now we can work with these values
# NOTE(review): cpm() is called on the raw count matrix here, not on the
# DGEList, so the normalisation factors computed above are presumably not
# applied to these CPM values -- confirm this is intended.
tempcpm <- as.data.frame(cpm(y$counts))
nrow(tempcpm)
# Calculate average cpm / gene for CO2, EFAE, SMAR samples
tempcpm$CO2 <- rowMeans(tempcpm[, rownames(metadata[metadata$Generation == "f0" & metadata$Treatment == "CO2", ])])
tempcpm$EFAE <- rowMeans(tempcpm[, rownames(metadata[metadata$Generation == "f0" & metadata$Treatment == "EFAE", ])])
tempcpm$SMAR <- rowMeans(tempcpm[, rownames(metadata[metadata$Generation == "f0" & metadata$Treatment == "SMAR", ])])
# Restrict to each infection's DE genes. The superseded lines that used the
# undefined `fullefae` (an error in a clean session) are removed.
tempecpm <- tempcpm[rownames(tempcpm) %in% totsefae$GeneID, c("CO2", "EFAE")]
tempscpm <- tempcpm[rownames(tempcpm) %in% totssmar$GeneID, c("CO2", "SMAR")]
colnames(tempecpm)[1] <- "E_CO2"
colnames(tempscpm)[1] <- "S_CO2"
# Outer join on gene ID, then reshape to long (Treatment, AverageCPM).
# gather() is assumed to be available from tidyr, attached elsewhere.
tempdegcpm <- merge(tempecpm, tempscpm, by = 0, all = TRUE)
tempdegcpm$Row.names <- NULL
cpmdeg <- gather(tempdegcpm, "Treatment", "AverageCPM")
## different genotype effect groups
tempec1 <- tempecpm[rownames(tempecpm) %in% totsefae[totsefae$Group == "Group1", 1], ]
tempec2 <- tempecpm[rownames(tempecpm) %in% totsefae[totsefae$Group == "Group2", 1], ]
tempec3 <- tempecpm[rownames(tempecpm) %in% totsefae[totsefae$Group == "Group3", 1], ]
tempec4 <- tempecpm[rownames(tempecpm) %in% totsefae[totsefae$Group == "Group4", 1], ]
tempsc1 <- tempscpm[rownames(tempscpm) %in% totssmar[totssmar$Group == "Group1", 1], ]
tempsc2 <- tempscpm[rownames(tempscpm) %in% totssmar[totssmar$Group == "Group2", 1], ]
tempsc3 <- tempscpm[rownames(tempscpm) %in% totssmar[totssmar$Group == "Group3", 1], ]
tempsc4 <- tempscpm[rownames(tempscpm) %in% totssmar[totssmar$Group == "Group4", 1], ]
# Column names become the Treatment values after gather(); the plotting and
# testing code relies on exactly these strings.
colnames(tempec1) <- c("g1_co2", "g1efae")
colnames(tempec2) <- c("g2_co2", "g2efae")
colnames(tempec3) <- c("g3_co2", "g3efae")
colnames(tempec4) <- c("g4_co2", "g4efae")
colnames(tempsc1) <- c("g1_co2", "g1smar")
colnames(tempsc2) <- c("g2_co2", "g2smar")
colnames(tempsc3) <- c("g3_co2", "g3smar")
colnames(tempsc4) <- c("g4_co2", "g4smar")
cpmefae <- rbind(
  gather(tempec1, "Treatment", "AverageCPM"),
  gather(tempec2, "Treatment", "AverageCPM"),
  gather(tempec3, "Treatment", "AverageCPM"),
  gather(tempec4, "Treatment", "AverageCPM")
)
cpmsmar <- rbind(
  gather(tempsc1, "Treatment", "AverageCPM"),
  gather(tempsc2, "Treatment", "AverageCPM"),
  gather(tempsc3, "Treatment", "AverageCPM"),
  gather(tempsc4, "Treatment", "AverageCPM")
)
## now we plot
# Violin plots of per-gene average CPM, with the mean of each Treatment level
# printed at its y position.
c1degmeans <- aggregate(AverageCPM ~ Treatment, cpmdeg, mean)
c1degmeans$AverageCPM <- round(c1degmeans$AverageCPM, 2)
C1deg <- ggplot(cpmdeg, aes(x = Treatment, y = AverageCPM)) +
  geom_violin(trim = FALSE) +
  ggtitle("AverageCPM per Gene for Differentially Expressed Genes\n1165-Efae, 1203-Smar") +
  ylim(0, 250) +
  geom_text(data = c1degmeans, aes(label = AverageCPM))
C1deg

# E. faecalis DE genes, separated by genotype-effect group.
c1efaemeans <- aggregate(AverageCPM ~ Treatment, cpmefae, mean)
c1efaemeans$AverageCPM <- round(c1efaemeans$AverageCPM, 2)
# Labels are keyed by Treatment value, so each label is tied to its level
# instead of depending on how the level names happen to sort.
efaelabs <- c(
  g1_co2 = "No Genotype Effects\n(Group 1 CO2)",
  g1efae = "No Genotype Effects\n(Group 1 EFAE)",
  g2_co2 = "Genotype Effects in Control\n(Group 2 CO2)",
  g2efae = "Genotype Effects in Control\n(Group 2 EFAE)",
  g3_co2 = "Genotype Effects in Treated\n(Group 3 CO2)",
  g3efae = "Genotype Effects in Treated\n(Group 3 EFAE)",
  g4_co2 = "Genotype Effects in Both\n(Group 4 CO2)",
  g4efae = "Genotype Effects in Both\n(Group 4 EFAE)"
)
C1efae <- ggplot(cpmefae, aes(x = Treatment, y = AverageCPM)) +
  geom_violin(trim = FALSE) +
  ggtitle("AverageCPM per Gene\nEfae Differentially Expressed Genes (1165)") +
  ylim(0, 300) +
  geom_text(data = c1efaemeans, aes(label = AverageCPM)) +
  scale_x_discrete(labels = efaelabs) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
C1efae

# Then the S. marcescens DE genes, separated further by genotype-effect group.
c1smarmeans <- aggregate(AverageCPM ~ Treatment, cpmsmar, mean)
c1smarmeans$AverageCPM <- round(c1smarmeans$AverageCPM, 2)
smarlabs <- c(
  g1_co2 = "No Genotype Effects\n(Group 1 CO2)",
  g1smar = "No Genotype Effects\n(Group 1 SMAR)",
  g2_co2 = "Genotype Effects in Control\n(Group 2 CO2)",
  g2smar = "Genotype Effects in Control\n(Group 2 SMAR)",
  g3_co2 = "Genotype Effects in Treated\n(Group 3 CO2)",
  g3smar = "Genotype Effects in Treated\n(Group 3 SMAR)",
  g4_co2 = "Genotype Effects in Both\n(Group 4 CO2)",
  g4smar = "Genotype Effects in Both\n(Group 4 SMAR)"
)
C1smar <- ggplot(cpmsmar, aes(x = Treatment, y = AverageCPM)) +
  geom_violin(trim = FALSE) +
  ggtitle("AverageCPM per Gene\nSmar Differentially Expressed Genes (1203)") +
  ylim(0, 500) +
  geom_text(data = c1smarmeans, aes(label = AverageCPM)) +
  scale_x_discrete(labels = smarlabs) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
C1smar
# P-value tests
# First test control vs treated across the full DE gene set of each infection.
# `alternative` is now spelled correctly; the misspelled `aternative=` was
# silently absorbed by `...`, so the two-sided test ran only by default.
efaeC1deg <- wilcox.test(
  cpmdeg[cpmdeg$Treatment == "E_CO2", "AverageCPM"],
  cpmdeg[cpmdeg$Treatment == "EFAE", "AverageCPM"],
  paired = FALSE, alternative = "two.sided"
)
smarC1deg <- wilcox.test(
  cpmdeg[cpmdeg$Treatment == "S_CO2", "AverageCPM"],
  cpmdeg[cpmdeg$Treatment == "SMAR", "AverageCPM"],
  paired = FALSE, alternative = "two.sided"
)

## Next, test control vs treated within each genotype-effect group.
tempgroups <- c("g1", "g2", "g3", "g4")
# Wilcoxon rank-sum p-value for one group: "<group>_co2" against
# "<group><suffix>" rows of a long (Treatment, AverageCPM) table.
group_wilcox_p <- function(group, cpm_long, suffix) {
  ctrl <- cpm_long[cpm_long$Treatment == paste0(group, "_co2"), "AverageCPM"]
  trt <- cpm_long[cpm_long$Treatment == paste0(group, suffix), "AverageCPM"]
  wilcox.test(ctrl, trt, paired = FALSE, alternative = "two.sided")$p.value
}
efaesig <- vapply(
  tempgroups, group_wilcox_p, numeric(1),
  cpm_long = cpmefae, suffix = "efae", USE.NAMES = FALSE
)
smarsig <- vapply(
  tempgroups, group_wilcox_p, numeric(1),
  cpm_long = cpmsmar, suffix = "smar", USE.NAMES = FALSE
)

# Bonferroni correction for 5 tests per infection (presumably the full-set
# test plus the four groups -- TODO confirm). The correction is applied to the
# unrounded p-values and capped at 1; rounding first turned small p-values
# into exact zeros.
n_tests <- 5
sigs <- data.frame(
  Gene_Group = tempgroups,
  Efae_sig = round(efaesig, 5),
  Smar_sig = round(smarsig, 5)
)
sigs$efae_corrected <- pmin(1, efaesig * n_tests)
sigs$smar_corrected <- pmin(1, smarsig * n_tests)
# Drop every object whose name contains "temp".
rm(list = ls(pattern = "temp"))
pmin(1, efaeC1deg$p.value * n_tests)
pmin(1, smarC1deg$p.value * n_tests)
sigs
smarC1deg$p.value
# Ad hoc console arithmetic kept from the original session.
# NOTE(review): what these sums represent is not documented.
10 + 433 + 57
88 + 91 + 84
C1deg
C1efae
C1smar
c1smarmeans
# Attach fitdistrplus and print how to cite it.
library(fitdistrplus)
citation("fitdistrplus")
##load Libraries
library(dplyr)
## (dplyr is attached a second time here; the repeated call has no effect)
library(dplyr)
library(rlang)
library(ggplot2)
library(reshape2)
library(KSgeneral)
rm(list=ls())
# Import all the tables.
# The files are listed explicitly. Earlier drafts read every "*.txt" in the
# working directory (a glob used as a regex, which also picks up unrelated
# text files) and were then discarded by a second rm(list=ls()), so only this
# final version is kept.
# NOTE: `names` is kept as the variable name although it masks base::names()
# for non-call lookups.
names <- c("deimmune.txt", "detreated.txt", "exnonDEimm.txt", "expressed.txt", "transcources.txt")
myfullfiles <- lapply(names, read.delim)
# Rows that have "-" in the Allele column generally have "-" in all the other
# columns, so we remove them.
myfullfiles <- lapply(myfullfiles, function(x) x[!x$Allele == "-", ])
# We assume that records with the same location, gene name and amino-acid
# change are equivalent even if they come from different isoforms.
# HOWEVER this filtering DOES keep SNPs that result in different amino acids
# depending on the transcript.
myfullfiles <- lapply(myfullfiles, function(x) {
  unique(x[, c(
    "Location", "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene",
    "Protein_position", "Amino_acids", "Codons", "BLOSUM62"
  )])
})
