Summary

This script is the 2nd of 4 scripts meant to process data in R. This script deals only with the PARSED reads aligned to either A4 or B6.

This script is broken up into 8 main sections

  1. Xchromosome contamination
  2. Testing for differentially expressed genes in parents
  3. Testing for allelic imbalance
  4. Testing for trans effects
  5. Assigning categorization
  6. Plotting
  7. Additional Comparisons
  8. P-value Calculation

This script needs the following files to run :

  1. Count data : CV_complete_countmatrix.txt
  2. Metadata for parsed sample: cv_asap_meta.txt
  3. Xchromosome genes: dm6_chrXrefGenes.txt
  4. Table of gene conversions : gene_symbol_conversions4.txt
  5. High confidence genes to use for the analysis: poi01_genes.txt
  6. immunelist_LB_2019_cat.txt
  7. list of drosophila TFs : tf_Celniker2013_v2.txt
  8. File of cis and trans effects in the control sample. This file is technically generated by this script but is included here in case the user does not feel like running all 3 treatment conditions to generate the necessary files : control_signal.txt
  9. File of cis and trans effects in the Efae sample. This file is technically generated by this script but is included here in case the user does not feel like running all 3 treatment conditions to generate the necessary files :efae_cistrans.txt
  10. File of cis and trans effects in the smar sample. This file is technically generated by this script but is included here in case the user does not feel like running all 3 treatment conditions to generate the necessary files : smar_cistrans.txt
  11. summary of cis and trans effects in each treatment: signal_comp.txt

Section 1: Xchromosome Contamination

Please note that the user can set the ‘cond’ variable to ‘CO2’, ‘EFAE’ or ‘SMAR’.

##Load up your environment
library(edgeR)
## Loading required package: limma
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(reshape2)
library(matrixStats)
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.2
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
library(ggfortify)
library(devtools)
## Loading required package: usethis
library(FactoMineR)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(ggbiplot)
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## Loading required package: scales
## Loading required package: grid
## 
## Attaching package: 'ggbiplot'
## The following object is masked from 'package:ggfortify':
## 
##     ggbiplot
library(limma)
library(schoolmath)
# NOTE: the former `rm(list=ls())` was removed on purpose. Clearing the global
# environment from inside a script silently destroys whatever the caller had
# loaded; start a fresh R session instead if a clean workspace is wanted.
#setwd("~/Desktop/Wunderlichlab/Cis_var_project/Cisvar7/DSPR_alig/Try9")
#### CHOOSE PARAMETERS####

##User can specify condition from the below 3 options
cond<-"CO2"
#cond<-"EFAE"
#cond<-"SMAR"

# Fail right away on a mistyped condition instead of much later in the script.
valid_conds <- c("CO2", "EFAE", "SMAR")
if (!(length(cond) == 1 && cond %in% valid_conds)) {
  stop("cond must be one of: ", paste(valid_conds, collapse = ", "), call. = FALSE)
}

#### LOAD DATA #####
# Sample metadata, keyed by sample name.
metadata<-read.table("cv_asap_meta.txt", header = TRUE, row.names = "Sample")
# Count matrix, keyed by gene ID. The first 48 columns are not used by this
# script (NOTE(review): confirm what those columns hold), and genes with any
# missing count are dropped.
cv7asapc<- read.table("CV_complete_countmatrix.txt", row.names="GENEID", header=TRUE)
cv7asapc<-cv7asapc[,-(1:48)]
cv7asapc <-cv7asapc[complete.cases(cv7asapc),]
# X-chromosome genes and the gene ID conversion table used to translate them.
xgenes<- read.table("dm6_chrXrefGenes.txt", header=FALSE)
con_gene<-read.table("gene_symbol_conversions4.txt", header = TRUE)
con_gene$Gene_symbol<-NULL
# High-confidence gene list, immune gene list and transcription factor list.
goodgenes<-read.table("poi01_genes.txt", header = TRUE)
immunegenes<-read.table("immunelist_LB_2019_cat.txt", header = TRUE, row.names = "FlybaseID")
tfs<-read.table("tf_Celniker2013_v2.txt", header=TRUE)
# Previously generated cis/trans results (control, summary, Efae, Smar).
cdegenes2toss<-read.table("control_signal.txt", header=TRUE)
con_comp<-read.table("signal_comp.txt", header=TRUE)
esig<-read.table("efae_cistrans.txt", header = TRUE)
ssig<-read.table("smar_cistrans.txt", header = TRUE)
##Remove the ChrX genes:
# Left join keeps every X gene even when no converted ID is available.
xconverted<-merge(xgenes, con_gene, by.x=1, by.y=1, all.x=TRUE)

cv7goodc<-cv7asapc[ rownames(cv7asapc) %in% (goodgenes$goodgenes),] ### v2 plots have all been filtered for okgenes 
cv7nox<-cv7goodc[! rownames(cv7goodc) %in% (xconverted$Primary_FBgn),]



### And now we begin.
# Total reads on X-linked genes for every count column.
cv7_asapx<-cv7asapc[ rownames(cv7asapc) %in% (xconverted$Primary_FBgn),]
xsums<-colSums(cv7_asapx,na.rm = TRUE) ##this doesnt like NAs
xsums<-as.data.frame(xsums)
###ok so now lets pull out the meta data
# Keep only the parsed (allele-assigned) samples.
parsed<-(metadata$Parsed==TRUE)
sampleset<-rownames(metadata)[parsed]
xsums$Samp<-rownames(xsums)
xsums<-xsums[rownames(xsums) %in% sampleset,]
metadata2<-metadata[rownames(metadata) %in% sampleset,]
# Library ID = sample name with everything from the first "_g" onwards removed
# (e.g. "B18_231_g1" -> "B18_231").
wholesamp<-sub("_g.*$", "", rownames(xsums))
xsums<-cbind(xsums, wholesamp)
xsums<-merge(xsums, metadata2, by=0)
# Display order of the libraries on the x axis.
sampleord<-c("B18_209","B18_216","B18_222","B19_087",
             "B18_231","B18_233","B19_086","B19_093",
             "B18_200","B18_230","B18_232","B19_092",
             "B18_198","B18_217","B18_224","B19_085",
             "B19_075","B19_082","B19_091","B19_095",
             "B19_078","B19_081","B19_090","B19_097",
             "B19_077","B19_079","B19_089","B19_094",
             "B19_076","B19_080","B19_088","B19_096",
             "B18_203","B18_220","B18_226","B19_073",
             "B18_205","B18_234","B18_235","B18_236","B19_084",
             "B18_204","B18_210","B18_219","B19_083",
             "B18_202","B18_228","B19_074")
####Calculate percentages
# Share of each library's X-linked reads found in each of its parsed columns.
# Rows are grouped by library ID rather than by odd/even row position, so the
# result no longer depends on the two columns of a library being adjacent.
pair_id<-as.character(xsums$wholesamp)
pair_size<-ave(xsums$xsums, pair_id, FUN = length)
if (any(pair_size != 2)) {
  warning("Some libraries do not have exactly two parsed columns: ",
          paste(unique(pair_id[pair_size != 2]), collapse = ", "), call. = FALSE)
}
pair_total<-ave(xsums$xsums, pair_id, FUN = sum)
Percent<-(xsums$xsums/pair_total)*100

xsums<-cbind(xsums,Percent)
xsums$wholesamp <-factor(xsums$wholesamp, levels = sampleord)
###
ggplot(data=xsums, aes(x=wholesamp, y=Percent, fill=Alignment)) +
  geom_bar(stat="identity")+
  xlab("Sample")+
  ggtitle("Xchromosome Misslabeling")+
  theme(axis.title.x = element_text(size = 20), axis.title.y = element_text(size = 20), 
        axis.text.x = element_text(size= 16,angle = 90, vjust = 1, hjust = 1), 
        axis.text.y = element_text(size= 16))+
  scale_fill_manual(breaks = c("a4","b6"), values=c("#00bfc4","#f8766d"))

Section 2: Testing for differentially expressed genes in parents

In order to identify changes in cis we need to see differential expression in the parents that is maintained to some extent in the F1-hybrids. So for the first test we simply look for differential expression in the parental samples for the specified condition.

#select the samples we will be using
# Replicates used for the allelic-imbalance (cis) test and for the trans test.
# The control condition splits them 1/3 vs 2/4; both infections use 1/2 vs 3/4.
# An unknown condition now stops the script instead of printing a message and
# continuing with undefined sample sets.
rep_split <- switch(cond,
  CO2  = list(ci = c("1", "3"), tr = c("2", "4")),
  SMAR = ,
  EFAE = list(ci = c("1", "2"), tr = c("3", "4")),
  stop("Unknown cond '", cond, "': expected \"CO2\", \"EFAE\" or \"SMAR\"", call. = FALSE)
)

in_cond <- metadata$Treatment == cond
is_f1 <- in_cond & (metadata$Generation == "F1")

# Logical masks over the rows of metadata.
parents <- in_cond & (metadata$Generation == "F0") & (metadata$Parsed == FALSE)
hybrids <- is_f1
hybridsCi <- is_f1 & (metadata$Rep %in% rep_split$ci)
hybridsTr <- is_f1 & (metadata$Rep %in% rep_split$tr)

# Turn the masks into sample names.
parents<-rownames(metadata)[parents]
hybrids<-rownames(metadata)[hybrids]
hybridsCi<-rownames(metadata)[hybridsCi]
hybridsTr<-rownames(metadata)[hybridsTr]

test<-c(parents,hybrids)
####YOU GOTTA REORDER THE META DATA BECAUSE WHEN YOU SUBSET BASED ON A CHARACTER VECTOR THE ORDER OF THE DF IS MAINTAINED NOT OF THE VECTOR!
# match() both restricts metadata to the selected samples and puts the rows in
# the order of `test` (parents first, then hybrids).
metadata<-metadata[match(test, rownames(metadata)),]

# Expression filter: keep genes whose CPM exceeds 1 in at least half of the
# count columns of cv7nox.
thresh <- ncol(cv7nox) / 2
isexpr <- rowSums(cpm(cv7nox) > 1) >= thresh
cv7nox <- cv7nox[isexpr, ]
####  diffferentially expressed genes - parents ####
#only need treated parental samples for the first part

# Parental columns only, arranged in the order given by `parents`.
cv7parent <- cv7nox[, names(cv7nox) %in% parents]
cv7parent <- setcolorder(cv7parent, parents)
y <- calcNormFactors(DGEList(counts = cv7parent, genes = rownames(cv7parent)))
#assign genotype

GT <- factor(c(metadata[rownames(metadata) %in% parents, "Genotype"])) ##1=a4, 3 = b6
BC <- factor(c(metadata[rownames(metadata) %in% parents, "Batch"]))

data.frame(Sample=colnames(y), GT, BC)
##    Sample GT BC
## 1 B18_209  1  1
## 2 B18_216  1  2
## 3 B18_222  1  3
## 4 B19_087  1  8
## 5 B18_198  3  0
## 6 B18_217  3  2
## 7 B18_224  3  3
## 8 B19_085  3  8
# Genotype effect with extraction batch as a covariate; tdesign is the
# genotype-only design.
design <- model.matrix(~GT+BC)
tdesign <- model.matrix(~GT)
rownames(design) <- colnames(y)
design
##         (Intercept) GT3 BC1 BC2 BC3 BC8
## B18_209           1   0   1   0   0   0
## B18_216           1   0   0   1   0   0
## B18_222           1   0   0   0   1   0
## B19_087           1   0   0   0   0   1
## B18_198           1   1   0   0   0   0
## B18_217           1   1   0   1   0   0
## B18_224           1   1   0   0   1   0
## B19_085           1   1   0   0   0   1
## attr(,"assign")
## [1] 0 1 2 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$GT
## [1] "contr.treatment"
## 
## attr(,"contrasts")$BC
## [1] "contr.treatment"
y <- estimateDisp(y, design, robust=TRUE)
y$common.dispersion
## [1] 0.2090148
pcpm <- cpm(y)

fit <- glmFit(y, design)
####topTags
# Likelihood-ratio test on the genotype coefficient (design column 2).
Parent_lrt <- glmLRT(fit, coef = 2)
FDR <- p.adjust(Parent_lrt$table$PValue, method = "fdr")
DE_parents <- cbind(Parent_lrt$table, FDR, y$genes$genes)
colnames(DE_parents)[6] <- "gene"
DE_parents$parentDE_sig <- DE_parents$FDR <= 0.05
# Genes differentially expressed between the parents at FDR <= 0.05.
Pa_de.genes <- DE_parents[DE_parents$parentDE_sig, ] ## 547:control, 386:treated

## tidy up our objects
##if it doesnt spark joy remove it
rm(design, FDR, fit, GT, isexpr, Parent_lrt, y, BC)

Section 3: Testing for allelic imbalance

In order to identify changes in cis we need evidence of allelic imbalance in the parents and F1-hybrids. So for the second test we look for allelic imbalance in the F1 hybrids.

#### ASE - hybrids ####
# Allele-specific counts of the F1 replicates reserved for the cis test,
# with the columns arranged in the order given by hybridsCi.
cv7_F1s <- cv7nox[, names(cv7nox) %in% hybridsCi]
cv7_F1s <- setcolorder(cv7_F1s, hybridsCi)
z <- calcNormFactors(DGEList(counts = cv7_F1s, genes = rownames(cv7_F1s)))
##
AG <- factor(c(metadata[rownames(metadata) %in% hybridsCi, "Alignment"])) ## 0=A4, 1=B6
MG <- factor(c(metadata[rownames(metadata) %in% hybridsCi, "MG"])) ## 1=B6 mom, 0=A4 mom
PO <- factor(c(metadata[rownames(metadata) %in% hybridsCi, "PO"])) ## 0=inhiereted from father, 1=inhiereted from mother
BC <- factor(c(metadata[rownames(metadata) %in% hybridsCi, "Batch"])) ## extraction batch

data.frame(Sample=colnames(z), AG, PO, MG, BC)
##       Sample AG PO MG BC
## 1 B18_231_g1  1  0  1  4
## 2 B18_231_g2  2  1  1  4
## 3 B19_086_g1  1  0  1  8
## 4 B19_086_g2  2  1  1  8
## 5 B18_230_g1  1  1  0  4
## 6 B18_230_g2  2  0  0  4
## 7 B19_092_g1  1  1  0  9
## 8 B19_092_g2  2  0  0  9
# Allele effect, with parent of origin, maternal genotype and batch as covariates.
design <- model.matrix(~AG+PO+MG+BC)
rownames(design) <- colnames(z)
design
##            (Intercept) AG2 PO1 MG1 BC8 BC9
## B18_231_g1           1   0   0   1   0   0
## B18_231_g2           1   1   1   1   0   0
## B19_086_g1           1   0   0   1   1   0
## B19_086_g2           1   1   1   1   1   0
## B18_230_g1           1   0   1   0   0   0
## B18_230_g2           1   1   0   0   0   0
## B19_092_g1           1   0   1   0   0   1
## B19_092_g2           1   1   0   0   0   1
## attr(,"assign")
## [1] 0 1 2 3 4 4
## attr(,"contrasts")
## attr(,"contrasts")$AG
## [1] "contr.treatment"
## 
## attr(,"contrasts")$PO
## [1] "contr.treatment"
## 
## attr(,"contrasts")$MG
## [1] "contr.treatment"
## 
## attr(,"contrasts")$BC
## [1] "contr.treatment"
z <- estimateDisp(z, design, robust=TRUE)
z$common.dispersion
## [1] 0.005136854
hcpm <- cpm(z)
fit <- glmFit(z, design)

# Likelihood-ratio test on the allele coefficient (design column 2).
F1_lrt_AG <- glmLRT(fit, coef = 2)
FDR <- p.adjust(F1_lrt_AG$table$PValue, method = "fdr")
f1_AG <- cbind(F1_lrt_AG$table, FDR, z$genes$genes)
colnames(f1_AG)[6] <- "gene"
f1_AG$ASE <- f1_AG$FDR <= 0.05
# Genes showing allelic imbalance at FDR <= 0.05.
f1_AG_de.genes <- f1_AG[f1_AG$ASE, ] ###1056:control, 1082:treated

##clean up our space
rm(design, FDR, fit, z, F1_lrt_AG, AG, BC, MG, PO)

Section 4 : Testing for trans effects

# Samples for the trans test: the parental libraries followed by the F1
# replicates reserved for this test, with count columns in that same order.
sampl<-append(parents,hybridsTr)
transmat<-cv7nox[,names(cv7nox) %in% sampl]
transmat<-setcolorder(transmat, sampl)

GN <- factor(c(metadata$Alignment[rownames(metadata) %in% sampl])) #genotype : 0=a4 reads , 1=b6 reads
OG <- factor(c(metadata$Generation[rownames(metadata) %in% sampl]))##Origin : 0= parents, 1= hybrids
BT <- factor(c(metadata$Batch[rownames(metadata) %in% sampl])) 

# NOTE(review): unlike the parent and hybrid fits, no calcNormFactors() call is
# made on this DGEList - confirm that this is intentional.
zz <- DGEList(counts=transmat, genes=rownames(transmat))

data.frame(Sample=colnames(zz), GN, OG, BT)
##        Sample GN OG BT
## 1     B18_209  1  1  1
## 2     B18_216  1  1  2
## 3     B18_222  1  1  3
## 4     B19_087  1  1  8
## 5     B18_198  2  1  0
## 6     B18_217  2  1  2
## 7     B18_224  2  1  3
## 8     B19_085  2  1  8
## 9  B18_233_g1  1  2  4
## 10 B18_233_g2  2  2  4
## 11 B19_093_g1  1  2  9
## 12 B19_093_g2  2  2  9
## 13 B18_200_g1  1  2  0
## 14 B18_200_g2  2  2  0
## 15 B18_232_g1  1  2  4
## 16 B18_232_g2  2  2  4
# Batch columns without an intercept, plus genotype, origin and their interaction.
design <- model.matrix(~0+BT+GN*OG)
rownames(design) <- colnames(zz)
design
##            BT0 BT1 BT2 BT3 BT4 BT8 BT9 GN2 OG2 GN2:OG2
## B18_209      0   1   0   0   0   0   0   0   0       0
## B18_216      0   0   1   0   0   0   0   0   0       0
## B18_222      0   0   0   1   0   0   0   0   0       0
## B19_087      0   0   0   0   0   1   0   0   0       0
## B18_198      1   0   0   0   0   0   0   1   0       0
## B18_217      0   0   1   0   0   0   0   1   0       0
## B18_224      0   0   0   1   0   0   0   1   0       0
## B19_085      0   0   0   0   0   1   0   1   0       0
## B18_233_g1   0   0   0   0   1   0   0   0   1       0
## B18_233_g2   0   0   0   0   1   0   0   1   1       1
## B19_093_g1   0   0   0   0   0   0   1   0   1       0
## B19_093_g2   0   0   0   0   0   0   1   1   1       1
## B18_200_g1   1   0   0   0   0   0   0   0   1       0
## B18_200_g2   1   0   0   0   0   0   0   1   1       1
## B18_232_g1   0   0   0   0   1   0   0   0   1       0
## B18_232_g2   0   0   0   0   1   0   0   1   1       1
## attr(,"assign")
##  [1] 1 1 1 1 1 1 1 2 3 4
## attr(,"contrasts")
## attr(,"contrasts")$BT
## [1] "contr.treatment"
## 
## attr(,"contrasts")$GN
## [1] "contr.treatment"
## 
## attr(,"contrasts")$OG
## [1] "contr.treatment"
zz <- estimateDisp(zz, design, robust=TRUE)
zz$common.dispersion
## [1] 0.1229924
fit <- glmFit(zz, design)

# Test the genotype-by-origin interaction. Its column position depends on how
# many batch columns the design has (column 10 in the control design printed
# above), so it is located by name instead of by a per-condition hard-coded
# index (previously 10 for CO2, 9 for SMAR, 7 for EFAE).
int_col <- grep(":", colnames(design), fixed = TRUE)
if (length(int_col) != 1L) {
  stop("Expected exactly one interaction column in the design, found ",
       length(int_col), call. = FALSE)
}
teff_lrt_int <- glmLRT(fit, coef = int_col)


t_eff <- teff_lrt_int$table
FDR <- p.adjust(t_eff$PValue, method="fdr")
t_eff <- cbind(t_eff, FDR, zz$genes$genes)
colnames(t_eff)[6] <- "gene"
t_eff$trans_sig <- t_eff$FDR <= 0.05
# Genes with a significant interaction at FDR <= 0.05.
t_eff_de.genes <- t_eff[t_eff$FDR <= 0.05, ]

Section 5 : Assigning categorization

Now we can assign each gene a cis/trans category depending on the results from sections 2-4. The logic is explained here: PGRP-SB1 Binding sites

#first we combine the parental and hybrid DE genes and use that to pull out rows from our larger data matrix 

colnames(DE_parents)[1]<-"PLogFC"
colnames(f1_AG)[1]<-"FLogFC"
genesDE<-merge(DE_parents, f1_AG, by=0, all = TRUE)
rownames(genesDE)<-genesDE[,1]
col2keep<-as.character(c("Row.names","PLogFC","FLogFC","parentDE_sig","ASE"))
genesDE<-genesDE[,names(genesDE) %in% col2keep]
# Merging by row name a second time adds another "Row.names" column; the
# duplicate (renamed Row.names.1 by the column subset) is removed below.
genesDE<-merge(genesDE, t_eff, by=0, all = TRUE)
## Warning in merge.data.frame(genesDE, t_eff, by = 0, all = TRUE): column
## name 'Row.names' is duplicated in the result
col2keep<-as.character(c("Row.names","PLogFC","FLogFC","parentDE_sig","ASE","trans_sig"))
genesDE<-genesDE[,names(genesDE) %in% col2keep]
rownames(genesDE)<-genesDE[,1]
genesDE$Row.names.1<-NULL
# Genes missing from the trans test count as "not significant".
genesDE$trans_sig <- ifelse(is.na(genesDE$trans_sig), 'FALSE', genesDE$trans_sig)

### assign categories
# Classify each gene from its three significance flags (parental DE, allelic
# imbalance in the F1, trans interaction):
#   cis   = DE + ASE, no trans        trans = DE + trans, no ASE
#   ct    = DE + ASE + trans          comp  = ASE + trans, no DE
#   con   = none of the three         und   = every other combination
# Vectorized over all genes; a missing flag yields "und" instead of making an
# `if` condition evaluate to NA.
classify_cis_trans <- function(parent_sig, ase_sig, trans_sig) {
  p <- as.logical(parent_sig)
  a <- as.logical(ase_sig)
  tr <- as.logical(trans_sig)
  yes <- function(v) !is.na(v) & v
  no <- function(v) !is.na(v) & !v
  out <- rep("und", length(p))
  out[yes(p) & yes(a) & no(tr)] <- "cis"
  out[yes(p) & no(a) & yes(tr)] <- "trans"
  out[yes(p) & yes(a) & yes(tr)] <- "ct"
  out[no(p) & yes(a) & yes(tr)] <- "comp"
  out[no(p) & no(a) & no(tr)] <- "con"
  out
}
categories <- classify_cis_trans(genesDE$parentDE_sig, genesDE$ASE, genesDE$trans_sig)


# Counts per category, in the order cis, trans, ct, comp, und, con.
test <- unname(vapply(c("cis", "trans", "ct", "comp", "und", "con"),
                      function(k) sum(categories == k), integer(1)))
print(test)
## [1]   87   16   11   37 1000 3802

Section 6: Plotting

genesDE <- cbind(genesDE, categories)
genesDE <-as.data.frame(genesDE)
rownames(genesDE)<-genesDE[,1]

##plots using edgeR calculation: 
geneDEfull<-genesDE
# Undetermined genes are dropped before anything is written or plotted.
genesDE<-genesDE[!(genesDE$categories=="und"),]

#removes genes listed in control_signal.txt
geneDEnoc<-genesDE[! rownames(genesDE) %in% cdegenes2toss$x,]

# Plot titles for the chosen condition: `i` for the full plot, `ii` for the
# second plot. An unknown condition stops here rather than leaving a stale
# value of `i` to be used as the title.
plot_titles <- switch(cond,
  CO2  = c(full = "Cis and Trans effects in control",
           filtered = "Control Samples"),
  SMAR = c(full = "Cis and Trans effects in S. marcescens",
           filtered = "Immune responsive cis and trans effects in S. marcescens"),
  EFAE = c(full = "Cis and Trans effects in E. faecalis",
           filtered = "Immune responsive cis and trans effects in E. faecalis"),
  stop("Unknown cond '", cond, "': expected \"CO2\", \"EFAE\" or \"SMAR\"", call. = FALSE)
)
i <- plot_titles[["full"]]
ii <- plot_titles[["filtered"]]

if (cond == "CO2"){
  #write.table(genesDE, "co2_cistrans.txt", quote = F, row.names = F )
  # NOTE(review): co2genes holds every gene that is not "und" at this point,
  # "con" genes included - confirm that is the intended control signal list.
  co2genes<-rownames(genesDE)
  write.table(co2genes, "control_signal.txt", quote = FALSE, row.names = FALSE )
  genesDE2<-genesDE
  co2ct<-cbind(genesDE2$Row.names, as.character(genesDE2$categories))
  #write.table(co2ct, "co2_ct.txt", row.names = F, quote = F )
} else {
  # Infected conditions: drop the genes listed in the control signal file.
  controlsig<-read.table("control_signal.txt", header = TRUE)
  genesDE2<-genesDE[!rownames(genesDE) %in% controlsig$x,]
  if (cond == "SMAR"){
    write.table(genesDE2, "smar_cistrans.txt", quote = FALSE, row.names = FALSE )
    smarct<-cbind(genesDE2$Row.names, as.character(genesDE2$categories))
    #write.table(smarct, "smar_ct.txt", row.names = F, quote = F )
    #write.table(genesDE, "smar_ct_unfilt.txt", row.names = F, quote = F )
  } else {
    write.table(genesDE2, "efae_cistrans.txt", quote = FALSE, row.names = FALSE )
    efaect<-cbind(genesDE2$Row.names, as.character(genesDE2$categories))
    #write.table(efaect, "efae_ct.txt", row.names = F, quote = F )
    #write.table(genesDE, "efae_ct_unfilt.txt", row.names = F, quote = F )
  }
}

# One named colour per category, shared by both plots. Naming the values ties
# each colour to its category regardless of the order of `breaks`.
cat_colours <- c(cis = "blue", comp = "green", con = "yellow",
                 ct = "purple", trans = "red", und = "grey")

print(test)
## [1]   87   16   11   37 1000 3802
###FUll data
ggplot(data= genesDE, aes(y=FLogFC,x=PLogFC, colour=categories)) +
  geom_point(size = 4, alpha = .7) +
  geom_abline(color = "blue") +
  geom_hline(yintercept = 0, color = "red") +
  geom_vline(xintercept = 0) +
  xlab("Log2 Ratio Parents(B6/A4)") +
  ylab("Log2 Ratio F1-hybrid (B6/A4)") +
  ggtitle(i)+
  #ggtitle("Control")+
  ylim(-12.5,12.5) +
  xlim(-12.5,12.5) +
  theme(axis.title.x = element_text(size = 20), axis.title.y = element_text(size = 20), axis.text.x = element_text(size= 16), axis.text.y = element_text(size= 16)) +
  scale_color_manual(breaks = c("cis","comp","con","ct","trans","und"), values=cat_colours)

genesDE<-genesDE[!(genesDE$categories=="con"),]

##ONLY genes that dont show cis or trans signal in fatbody control samples
# The three per-condition branches drew the same plot from genesDE2 and
# differed only in the title, which `ii` already carries.
ggplot(data= genesDE2, aes(y=FLogFC,x=PLogFC, colour=categories)) +
  geom_point(size = 3, alpha = .7) +
  geom_abline(color = "blue") +
  geom_hline(yintercept = 0, color = "red") +
  geom_vline(xintercept = 0) +
  xlab("Log2 Ratio Parents(B6/A4)") +
  ylab("Log2 Ratio F1-hybrid (B6/A4)") +
  ggtitle(ii)+
  theme_bw()+
  ylim(-10,10) +
  xlim(-10,10) +
  scale_color_manual(breaks = c("cis", "trans", "ct", "und", "comp", "con"), values=cat_colours)
## Warning: Removed 7 rows containing missing values (geom_point).

sum(genesDE2$categories == "cis") 
## [1] 87
sum(genesDE2$categories == "trans") 
## [1] 16
sum(genesDE2$categories == "ct")
## [1] 11
sum(genesDE2$categories == "comp") 
## [1] 37
sum(genesDE2$categories == "und") 
## [1] 0
sum(genesDE2$categories == "con")
## [1] 3802

Section 7: Additional Comparisons

##Looking at expression Bias
# Genes are split by the sign of the parental log fold change (PLogFC):
# negative values go to the *_A4 tables, positive values to the *_B6 tables.
#bias in cis genes 
ec_A4 <- esig[with(esig, categories == 'cis' & is.negative(PLogFC)), ]
ec_B6 <- esig[with(esig, categories == 'cis' & is.positive(PLogFC)), ]
sc_A4 <- ssig[with(ssig, categories == 'cis' & is.negative(PLogFC)), ]
sc_B6 <- ssig[with(ssig, categories == 'cis' & is.positive(PLogFC)), ]
#bias in trans genes
et_A4 <- esig[with(esig, categories == 'trans' & is.negative(PLogFC)), ]
et_B6 <- esig[with(esig, categories == 'trans' & is.positive(PLogFC)), ]
st_A4 <- ssig[with(ssig, categories == 'trans' & is.negative(PLogFC)), ]
st_B6 <- ssig[with(ssig, categories == 'trans' & is.positive(PLogFC)), ]

##looking at overlap. 
# Full outer join on the first column, then keep only the genes that have a
# complete record in both infections.
es_sig <- merge(esig, ssig, by = 1, all = TRUE)
cat_cols <- c("categories.x", "categories.y")
es_sig[cat_cols] <- lapply(es_sig[cat_cols], as.character)
es_sig <- es_sig[complete.cases(es_sig), ]
paste("Genes showing cis or signal in both EFAE and SMAR=", nrow(es_sig))
## [1] "Genes showing cis or signal in both EFAE and SMAR= 86"
paste("Concordant Classifications ", nrow(es_sig[with(es_sig, categories.x == categories.y), ]))
## [1] "Concordant Classifications  71"
transig <- es_sig[with(es_sig, categories.x == "trans" | categories.y == "trans"), ]
paste("Shared trans effects ", nrow(es_sig[with(es_sig, categories.x == "trans" & categories.y == "trans"), ]))
## [1] "Shared trans effects  17"
###looking at category proportions
con_comp$T_samp <- factor(con_comp$T_samp, levels = con_comp$T_samp)
# Per treatment: each row's Value as a fraction of that treatment's total.
# NOTE(review): Per is a 0-1 fraction while the axis below is labelled
# "Percent" - confirm which one is intended.
ccc <- within(con_comp[con_comp$Treatment == "CO2", ], Per <- Value / sum(Value))
cce <- within(con_comp[con_comp$Treatment == "EFAE", ], Per <- Value / sum(Value))
ccs <- within(con_comp[con_comp$Treatment == "SMAR", ], Per <- Value / sum(Value))

ces <- rbind(ccc, cce, ccs)
ggplot(data = ces, aes(x = Signal, y = Per, fill = Treatment)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Percent") +
  scale_fill_manual(values = c("#4349D2", "#FFA373", "#CB66C2")) +
  ggtitle("Signal Comparison between Treatments") +
  theme_bw() +
  theme(
    axis.title.x = element_text(size = 20),
    axis.title.y = element_text(size = 20),
    axis.text.x = element_text(size = 16, angle = 90, vjust = 1, hjust = 1),
    axis.text.y = element_text(size = 16),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Section 8: P Value Calculation

Test 1: Comparing proportions of trans-only genes’ genotype preference in expression. A) Text: “nearly twice as many [genes] were expressed more highly in the B6 genotype (48 genes) than in the A4 genotype (27 genes) [of the trans-only genes in Efae infection]”. So we are testing whether the proportion of these genes that are more highly expressed in B6 (48 of 75) is greater than the proportion that are more highly expressed in A4 (27 of 75), i.e. whether the B6 proportion exceeds the null value of equal proportions (p = 0.5). Despite the function name, this is actually a chi-square test.

# One-sample test of proportions: 48 of the 75 genes, against a null
# proportion of 0.5, one-sided ("greater").
tesvt2 <- prop.test(x = 48, n = 75, p = 0.5, alternative = "greater")
tesvt2$p.value
## [1] 0.01046067

Test 2: Comparing proportions of each category for co2, efae, smar

  1. Text: “The control and Efae-infected samples had a greater proportion of cis-only genes than the Smar samples”
# Pairwise two-sample tests of the cis-only proportion. Each p-value is
# Bonferroni-adjusted for the 9 tests of this section with p.adjust(), which
# caps the adjusted value at 1 (a bare 9 * p can exceed 1).
#control vs Efae
success<-c(87,174)
trails<-c(151,263)
test3aCE<-prop.test(success,trails)
p.adjust(test3aCE$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## [1] 0.9320157
#control vs Smar
success<-c(87,79)
trails<-c(151,251)
test3aCS<-prop.test(success,trails)
p.adjust(test3aCS$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## [1] 3.959068e-06
#Efae vs Smar
success<-c(174,79)
trails<-c(263,251)
test3aES<-prop.test(success,trails)
p.adjust(test3aES$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## [1] 6.82653e-14
  2. Text: “three groups differ in the proportion of trans-only genes, with Smar-infected samples showing more than twice the proportion of trans signal, followed by Efae, and then the control samples”
# Pairwise two-sample tests of the trans-only proportion. Each p-value is
# Bonferroni-adjusted for the 9 tests of this section with p.adjust(), which
# caps the adjusted value at 1 (a bare 9 * p can exceed 1).
#Control vs Efae
success<-c(16,75)
trails<-c(151,263)
test3bCE<-prop.test(success,trails)
p.adjust(test3bCE$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## [1] 0.0003481742
#Control vs Smar
success<-c(16,149)
trails<-c(151,251)
test3bCS<-prop.test(success,trails)
p.adjust(test3bCS$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## [1] 1.539558e-20
#Efae vs SMar
success<-c(75,149)
trails<-c(263,251)
test3bES<-prop.test(success,trails)
p.adjust(test3bES$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## [1] 3.049017e-11
  3. Text: “uninfected fat body showed much more compensatory signal than either infected sample”
# Pairwise two-sample tests of the compensatory proportion. Each p-value is
# Bonferroni-adjusted for the 9 tests of this section with p.adjust(), which
# caps the adjusted value at 1 (a bare 9 * p can exceed 1).
#Control vs Efae
success<-c(37,6)
trails<-c(151,263)
test3cCE<-prop.test(success,trails)
p.adjust(test3cCE$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## [1] 2.922811e-11
#Control vs Smar
# The control sample has 37 compensatory genes (see the category counts and
# the Control vs Efae test above); this comparison previously used 38.
success<-c(37,16)
trails<-c(151,251)
test3cCS<-prop.test(success,trails)
p.adjust(test3cCS$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## (the previously recorded output, 1.797449e-06, was computed with 38 control genes; re-run to refresh)
#Efae vs Smar
success<-c(6,16)
trails<-c(263,251)
test3cES<-prop.test(success,trails)
p.adjust(test3cES$p.value, method = "bonferroni", n = 9) # Bonferroni correction ( we only do 9 tests)
## [1] 0.3429412