This script is the 4th of 4 scripts meant to process data in R. This script deals only Transcription Factor Binding Site data output by the Finding Instances of Motif Occurances program (FIMO).
Script is broken down into 4 main sections as follows:
Required files:
It is worth mentioning that we descided on the less stringent p-value threshold of p=.001 since it allowed us to detect the majority of previously described Binding sites.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(data.table)
library(schoolmath)
library(rlist)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.2
rm(list=ls())
#setwd("~/Desktop/Wunderlichlab/Cis_var_project/Vinay_collab")
#make a data frame for Binding site lengths
bsa4l<-read.table("fimo_a4_try2p001.tsv", header = T, stringsAsFactors = F)
bsb6l<-read.table("fimo_b6_try2p001.tsv", header = T, stringsAsFactors = F)
bsa4s<-read.table("fimo_a4_try2p0001.tsv", header = T, stringsAsFactors = F)
bsb6s<-read.table("fimo_b6_try2p0001.tsv", header = T, stringsAsFactors = F)
sengerbs<-read.table("senger_2004_bs.txt", header = T, stringsAsFactors = F)
immune_gene<-read.table("~/Desktop/Wunderlichlab/Cis_var_project/immunelist_LB_2019_cat.txt", header = T, stringsAsFactors = F)
#First we are going to look at the 5 immune genes from the Senger 2004 paper
tempimm<-immune_gene[immune_gene$Symbol %in% c("Dpt","Mtk","PGRP-SB1","Def"),1]
templist<-list(bsa4l,bsb6l,bsa4s, bsb6s)
#pull out ONLY the 4 immune genes
templist<-lapply(templist, function(x) x<-x[x$sequence_name %in% tempimm,])
genos<-c("a4","b6","a4","b6")
pthresh<-c(0.001,0.001,0.0001,0.0001)
for (i in 1:length(genos)){
templist[[i]]$genotype<-genos[i]
templist[[i]]$pthresh<-pthresh[i]
}
templist<-lapply(templist, function(x) cbind(x,paste(x$motif_id, x$genotype, x$pthresh, sep="_")))
templist<-lapply(templist, function(x) x<-x[,c(2,1,12,3,4:7)])
temp_comp<-as.data.frame(matrix(ncol=5,nrow = 0))
colnames(temp_comp)<-c("sequence_name","motif_id","paste(x$motif_id, x$genotype, x$pthresh, sep = \"_\")","start","stop")
for (i in 1:4){
tempsub<-templist[[i]]
temp_comp<-rbind(temp_comp,tempsub)
rm(tempsub)
}
colnames(temp_comp)<-c("Gene","TF","ID_tf","start","stop","strand","score","p.value")
temp_comp<-setcolorder(temp_comp,c("Gene","TF","ID_tf","start","stop","strand","score","p.value"))
sengerbs$genotype<-"iso1"
sengerbs$pthresh<-"none"
sengerbs$ID_tf<-paste(sengerbs$BS,sengerbs$genotype,sengerbs$pthresh,sep="_")
sengerbs$score<-18
sengerbs$p.value<-.0000008
tempsenger<-sengerbs[,c("Gene","BS","ID_tf","start_adj","stop_adj","strand","score","p.value")]
colnames(tempsenger)<-colnames(temp_comp)
temp_comp<-rbind(tempsenger,temp_comp)
temp_comp$ID_tf<-as.factor(temp_comp$ID_tf)
temp_comp$TF<-as.factor(temp_comp$TF)
#function 1 :
multioverlap<-function(ref){
ref$d1<-abs(ref$start-tempgenequery[k,c("stop")])
ref$d2<-abs(ref$stop-tempgenequery[k,c("start")])
#see if any values are worth keeping
if ( min(ref$d1)>thresh & min(ref$d2)>thresh ){
return("nonoverlapping")
} else if(nrow(ref[ref$d1==ref$d2,])>0) {
return(rownames(ref[ref$d1==ref$d2,]))
} else if (min(ref$d1) < min(ref$d2) & min(ref$d1)<thresh ){
if (nrow(ref[ref$d1== min(ref$d1),])>1){
ref$d3<-abs(ref$d1-ref$d2)
return(rownames(ref[ref$d3== min(ref$d3),]))
} else {
return(rownames(ref[ref$d1== min(ref$d1),]))
}
} else if (min(ref$d1) > min(ref$d2) & min(ref$d2)<thresh ){
if (nrow(ref[ref$d2== min(ref$d2),])>1){
ref$d3<-abs(ref$d1-ref$d2)
return(rownames(ref[ref$d3== min(ref$d3),]))
} else {
return(rownames(ref[ref$d2== min(ref$d2),]))
}
} else{
return("nonoverlapping")
}
}
#function2
isdup <- function (x) duplicated (x) | duplicated (x, fromLast = TRUE)
#lets compare stuff now!
hitlist<-list()
thresh<-15
for (i in 1:length(templist)){
tempdf<-templist[[i]]
colnames(tempdf)<-colnames(tempsenger)
tempgenelist<-unique(tempdf$Gene)
temp_qm_merged<-as.data.frame(matrix(ncol=16,nrow=0))
colnames(temp_qm_merged)<-c("Gene","TF","ID_tf","start","stop","strand","score","p.value","Gene","TF","ID_tf","start","stop","strand","score","p.value")
for (j in 1:length(tempgenelist)){
tempgenequery<-tempdf[tempdf$Gene==tempgenelist[j],] #query
tempgenematch<-tempsenger[tempsenger$Gene==tempgenelist[j],] #subject
# rm(tempgenequery,tempgenematch)
rowsmatched<-as.character()
for (k in 1:nrow(tempgenequery)){
#tempTS_strand_match<-tempgenematch[tempgenematch$TF==tempgenequery[k,c("TF")] & tempgenematch$strand==tempgenequery[k,c("strand")],]
tempTS_strand_match<-tempgenematch[tempgenematch$TF==tempgenequery[k,c("TF")] ,]
#no matches in the "ref set
if(nrow(tempTS_strand_match)==0){
tempTS_strand_match<-tempgenequery[k,]
tempTS_strand_match[,3:8]<-NA
temp_qm_comb<-cbind(tempTS_strand_match,tempgenequery[k,])
#one match in the ref set
} else if (nrow(tempTS_strand_match)>=1){
if(multioverlap(tempTS_strand_match)=="nonoverlapping"){
tempTS_strand_match<-tempgenequery[k,]
tempTS_strand_match[,3:8]<-NA
temp_qm_comb<-cbind(tempTS_strand_match,tempgenequery[k,])
}else {
tempTS_strand_match_sub<-tempTS_strand_match[rownames(tempTS_strand_match) ==multioverlap(tempTS_strand_match),]
temp_qm_comb<-cbind(tempTS_strand_match_sub,tempgenequery[k,])
rowsmatched<-append(rowsmatched,multioverlap(tempTS_strand_match))
}
}
temp_qm_merged<-rbind(temp_qm_merged,temp_qm_comb)
if (k==nrow(tempgenequery)){
temp_exclude_a<-tempgenematch[! rownames(tempgenematch) %in% rowsmatched,]
temp_exclude_b<-as.data.frame(matrix(ncol=8,nrow=nrow(temp_exclude_a)))
colnames(temp_exclude_b)<-c("Gene","TF","ID_tf","start","stop","strand","score","p.value")
temp_qm_comb<-cbind(temp_exclude_a,temp_exclude_b)
temp_qm_merged<-rbind(temp_qm_merged,temp_qm_comb)
}
#close of for loop3 TFBS iterating.
}
#close of for loop2 Gene subsetting
}
hitlist<-list.append(hitlist, temp_qm_merged)
#rm(tempsub)
#close of for loop 1 list sep
}
tfbs_list<-templist
rm(list=ls(pattern = "temp"))
tempnames<-c("rgene","rTF","rID_tf","rstart","rstop","rstrand","rscore","rp.value","Gene","TF","ID_tf","start","stop","strand","score","p.value")
tempsharedbs<-numeric()
temptotalbs<-numeric()
tempbsmiss<-numeric()
temprel<-numeric()
temprelmiss<-numeric()
tempsrp<-numeric()
tempsrpmiss<-numeric()
for (i in 1:length(hitlist)){
temphits<-hitlist[[i]]
colnames(temphits)<-tempnames
temptotalbs<-append(temptotalbs, sum(complete.cases(temphits$start)))
tempsharedbs<-append(tempsharedbs, sum(complete.cases(temphits)))
tempbsmiss<-append(tempbsmiss, 18-sum(complete.cases(temphits)))
temprelbs<-temphits[temphits$rTF=="rel"| temphits$TF=="rel",]
temprel<-append(temprel, sum(complete.cases(temprelbs)))
temprelmiss<-append(temprelmiss,11-sum(complete.cases(temprelbs)))
tempsrpbs<-temphits[temphits$rTF=="srp"| temphits$TF=="srp",]
tempsrp<-append(tempsrp,sum(complete.cases(tempsrpbs)))
tempsrpmiss<-append(tempsrpmiss,7-sum(complete.cases(tempsrpbs)))
}
bscomp<-as.data.frame(cbind(c("A4_p.001","B6_p.001","A4_p.0001","B6_p.0001"),temptotalbs, tempsharedbs, tempbsmiss,temprel,temprelmiss,tempsrp,tempsrpmiss))
bscomp1<-bscomp[,c("V1","temptotalbs", "tempsharedbs", "tempbsmiss")]
bscomp2<-bscomp[,c("V1","temprel", "temprelmiss", "tempsrp","tempsrpmiss")]
bscomp1<-gather(bscomp1,key = "gth", value = "n", temptotalbs:tempbsmiss)
## Warning: attributes are not identical across measure variables;
## they will be dropped
bscomp2<-gather(bscomp2,key = "gth", value = "n", temprel:tempsrpmiss)
## Warning: attributes are not identical across measure variables;
## they will be dropped
# function to tell of the a Binding sites are overlapping
bscomp1$n<-as.numeric(as.character(bscomp1$n))
bscomp2$n<-as.numeric(as.character(bscomp2$n))
BSCOMP2<-ggplot(bscomp2, aes(fill=`gth`, y=`n`, x=`V1`)) +
geom_bar(position="dodge", stat="identity")
BSCOMP2 +
ggtitle("REl and SRP Identified Binding \nSites by threshhold ")+
ylab("Number of Sites")+
xlab("Genotype and Pvalue threshold")+
scale_fill_manual(values=c("#01c74a", "#01792d","#03dafc", "#0284bc"),
name="Type of Site",
breaks=c("temprel", "temprelmiss", "tempsrp","tempsrpmiss"),
labels=c("Total matching REL sites\n Senger 2004", "Total missing REL sites\n Senger 2004", "Total matching SRP sites\n Senger 2004", "Total missing SRP sites\n Senger 2004"))+
theme_bw()
#looking at total TF Binding sites
##NOTE we descided to only consider genes showing signal in response to Efae and Smar Infection these genes have been filtered for Immune genes.
tempesig<-read.table("~/Desktop/Wunderlichlab/Cis_var_project/Cisvar7/DSPR_alig/Try9/efae_ct.txt", header = T, stringsAsFactors = F)
tempssig<-read.table("~/Desktop/Wunderlichlab/Cis_var_project/Cisvar7/DSPR_alig/Try9/smar_ct.txt", header = T, stringsAsFactors = F)
colnames(tempesig)[2]<-"EFAE"
colnames(tempssig)[2]<-"SMAR"
tempcombsig<-merge(tempesig,tempssig, by=1, all = T)
#looking at expression Bias
esig<-read.table("~/Desktop/Wunderlichlab/Cis_var_project/Cisvar7/DSPR_alig/Try9/efae_cistrans.txt", header = T)
ssig<-read.table("~/Desktop/Wunderlichlab/Cis_var_project/Cisvar7/DSPR_alig/Try9/smar_cistrans.txt", header = T)
esig$eA4high<-is.negative(esig$PLogFC)
esig<-esig[,c("Row.names","eA4high")]
ssig$sA4high<-is.negative(ssig$PLogFC)
ssig<-ssig[,c("Row.names","sA4high")]
sigsig<-merge(esig,ssig, by=1,all=T)
tempnewcat<-as.character()
for (i in 1:nrow(sigsig)){
temp<-sigsig[i,]
if (is.na(temp$eA4high)==TRUE){
tempnewcat<-append(tempnewcat,temp[1,c("sA4high")])
} else if (is.na(temp$sA4high)==TRUE){
tempnewcat<-append(tempnewcat,temp[1,c("eA4high")])
}else if (temp$eA4high == temp$sA4high){
tempnewcat<-append(tempnewcat,temp[1,c("eA4high")])
}else{
tempnewcat<-append(tempnewcat,"UNCLEAR")
}
}
sigsig<-cbind(sigsig,tempnewcat)
temp_class<-as.character()
for (i in 1:nrow(tempcombsig)){
templine<-tempcombsig[i,]
templine[is.na(templine)] = 0
templine <- data.frame(lapply(templine, function(x) {gsub("cis", "1", x)}))
templine <- data.frame(lapply(templine, function(x) {gsub("ct", "10", x)}))
templine <- data.frame(lapply(templine, function(x) {gsub("comp", "10", x)}))
templine <- data.frame(lapply(templine, function(x) {gsub("trans", "100", x)}))
templine$EFAE<-as.numeric(as.character(templine$EFAE))
templine$SMAR<-as.numeric(as.character(templine$SMAR))
templine$SUM<-templine$EFAE+templine$SMAR
if(templine$SUM<100){
temp_class<-append(temp_class, "cis")
}else if (templine$SUM==100 | templine$SUM==200){
temp_class<-append(temp_class, "trans")
}else if (templine$SUM%%100==10){
temp_class<-append(temp_class, "cis")
}else if(templine$SUM%%100==1){
temp_class<-append(temp_class, "exclude")
}
}
sum(temp_class=="exclude")
## [1] 3
sum(temp_class=="cis")
## [1] 219
sum(temp_class=="trans")
## [1] 199
tempcombsig<-cbind( tempcombsig, temp_class)
temp_a4cis<-bsa4l[bsa4l$sequence_name %in% tempcombsig[tempcombsig$temp_class=="cis",c("V1")],]
temp_a4trans<-bsa4l[bsa4l$sequence_name %in% tempcombsig[tempcombsig$temp_class=="trans",c("V1")],]
temp_b6cis<-bsb6l[bsb6l$sequence_name %in% tempcombsig[tempcombsig$temp_class=="cis",c("V1")],]
temp_b6trans<-bsb6l[bsb6l$sequence_name %in% tempcombsig[tempcombsig$temp_class=="trans",c("V1")],]
#templist<-list(temp_a4cis,temp_a4trans,temp_b6cis,temp_b6trans)
temp_data<-temp_a4cis
geno_cond<-"a4_cis"
gtf<-function(temp_data,geno_cond){
tempgenelist<-unique(temp_data$sequence_name)
relish<-as.numeric()
dorsal<-as.numeric()
serpent<-as.numeric()
creba<-as.numeric()
for(i in 1:length(tempgenelist)){
tempgene<-temp_data[temp_data$sequence_name==tempgenelist[i],]
relish<-append(relish, sum(tempgene$motif_id=="rel"))
dorsal<-append(dorsal, sum(tempgene$motif_id=="dl"))
serpent<-append(serpent, sum(tempgene$motif_id=="srp"))
creba<-append(creba, sum(tempgene$motif_id=="crebA"))
}
tempgenelist<-as.data.frame(cbind(tempgenelist, relish,dorsal,serpent,creba))
colnames(tempgenelist)<-c("Gene", paste(geno_cond,"rel", sep="_"),paste(geno_cond,"dl", sep="_"),paste(geno_cond,"srp", sep="_"),paste(geno_cond,"crebA", sep="_"))
newgenes<-gather(tempgenelist, key = "cond", value = "tfbs", paste(geno_cond,"rel", sep="_"):paste(geno_cond,"crebA", sep="_"))
newgenes$geno<-t(as.data.frame(strsplit(newgenes$cond,split = "_")))[,1]
newgenes$eff<-t(as.data.frame(strsplit(newgenes$cond,split = "_")))[,2]
newgenes$tf<-t(as.data.frame(strsplit(newgenes$cond,split = "_")))[,3]
newgenes$gene_tf<-paste(newgenes$Gene,newgenes$tf,sep="_")
return(newgenes)
}
allcis<-merge(gtf(temp_a4cis,"a4_cis"),gtf(temp_b6cis,"b6_cis"), by=7, all=T)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning: attributes are not identical across measure variables;
## they will be dropped
alltrans<-merge(gtf(temp_a4trans,"a4_trans"),gtf(temp_b6trans,"b6_trans"), by=7, all=T)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning: attributes are not identical across measure variables;
## they will be dropped
# a4 binding sites - b6 binding sites
allcis$diff<-as.numeric(allcis$tfbs.x) - as.numeric(allcis$tfbs.y)
alltrans$diff<-as.numeric(alltrans$tfbs.x) - as.numeric(alltrans$tfbs.y)
allisall<-rbind(allcis,alltrans)
allisall<-allisall[complete.cases(allisall),]
#function
#temp_data<-allisall
expression_direction<-function(tempdata){
temp_cats<-as.character()
for (i in 1:nrow(tempdata)){
templine<-as.character(tempdata[i, c("Gene.x")])
temp_cats<-append(temp_cats,as.character(sigsig[sigsig$Row.names==templine, c("tempnewcat")]))
}
return(temp_cats)
}
a4higher<-expression_direction(allisall)
allisall<-cbind(allisall,a4higher)
##gonna fill trhis upp with pvalues
temptfs<-c("all","rel","dl","srp","crebA")
tempcisdiff<-as.numeric()
temptransdiff<-as.numeric()
Figure4_A<-ggplot(allisall,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis",])/nrow(allisall[allisall$eff.y=="cis",]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans",])/nrow(allisall[allisall$eff.y=="trans",]))
Figure4_A+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="Differences in Total Identified Binding sites",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect" )+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
#geom_point(data=tempmeans, color='black',shape=23)
Figure4_C<-ggplot(allisall[allisall$tf.x=='rel',] ,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$tf.x =="rel" & allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$tf.x =="rel" & allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis"& allisall$tf.x=='rel',])/nrow(allisall[allisall$eff.y=="cis" & allisall$tf.x=='rel',]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans"& allisall$tf.x=='rel',])/nrow(allisall[allisall$eff.y=="trans" & allisall$tf.x=='rel',]))
Figure4_C+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="Relish Only\nDifferences in Total Identified Binding sites",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect")+
xlim(-11,6)+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
# geom_point(data=tempmeans, color='black',shape=23)
Figure4_B<-ggplot(allisall[allisall$tf.x=='dl',] ,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$tf.x =="dl" & allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$tf.x =="dl" & allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis"& allisall$tf.x=='dl',])/nrow(allisall[allisall$eff.y=="cis" & allisall$tf.x=='dl',]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans"& allisall$tf.x=='dl',])/nrow(allisall[allisall$eff.y=="trans" & allisall$tf.x=='dl',]))
Figure4_B+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="Dorsal Only\nDifferences in Total Identified Binding sites",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect")+
xlim(-11,6)+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
## Warning: Removed 1 rows containing missing values (geom_point).
#geom_point(data=tempmeans, color='black',shape=23)
Figure4_D<-ggplot(allisall[allisall$tf.x=='srp',] ,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$tf.x =="srp" & allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$tf.x =="srp" & allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis"& allisall$tf.x=='srp',])/nrow(allisall[allisall$eff.y=="cis" & allisall$tf.x=='srp',]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans"& allisall$tf.x=='srp',])/nrow(allisall[allisall$eff.y=="trans" & allisall$tf.x=='srp',]))
Figure4_D+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="Serpent Only\nDifferences in Total Identified Binding sites",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect",caption = "Ploted means values are 0 excluding")+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
xlim(-11,6)+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
## Warning: Removed 1 rows containing missing values (geom_point).
#geom_point(data=tempmeans, color='black',shape=23)
Figure4_E<-ggplot(allisall[allisall$tf.x=='crebA',] ,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$tf.x =="crebA" & allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$tf.x =="crebA" & allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis"& allisall$tf.x=='crebA',])/nrow(allisall[allisall$eff.y=="cis" & allisall$tf.x=='crebA',]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans"& allisall$tf.x=='crebA',])/nrow(allisall[allisall$eff.y=="trans" & allisall$tf.x=='crebA',]))
Figure4_E+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="CrebA Only\nDifferences in Total Identified Binding sites",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect",caption = "Ploted means values are 0 excluding")+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
xlim(-11,6)+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
#geom_point(data=tempmeans, color='black',shape=23)
tempprop<-as.numeric()
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis",]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans",]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis",]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans",])))))$p.value)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis" & allisall$tf.x=='rel',]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans" & allisall$tf.x=='rel',]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis" & allisall$tf.x=='rel',]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans" & allisall$tf.x=='rel',])))))$p.value)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis" & allisall$tf.x=='dl',]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans" & allisall$tf.x=='dl',]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis" & allisall$tf.x=='dl',]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans" & allisall$tf.x=='dl',])))))$p.value)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis" & allisall$tf.x=='srp',]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans" & allisall$tf.x=='srp',]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis" & allisall$tf.x=='srp',]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans" & allisall$tf.x=='srp',])))))$p.value)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis" & allisall$tf.x=='crebA',]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans" & allisall$tf.x=='crebA',]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis" & allisall$tf.x=='crebA',]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans" & allisall$tf.x=='crebA',])))))$p.value)
propsum<-as.data.frame(cbind(temptfs,tempcisdiff,temptransdiff))
colnames(propsum)<-c("tfs","cis","trans")
propsum<-gather(propsum, key = "test", value = "proportion_non0",cis:trans)
## Warning: attributes are not identical across measure variables;
## they will be dropped
propsum$proportion_non0<-as.numeric(propsum$proportion_non0)
Figure4_F<-ggplot(propsum,aes(x=`tfs`, y=`proportion_non0`, fill=`test`))
Figure4_F+
geom_bar(position="dodge", stat="identity")+
labs(title = "Proportion of genes with non-zero\ndifferences in TFbinding site",x="Transcription Factor", y="proportion of non-zero differences", caption = "No significance found for using test of proportions")+
scale_fill_manual("legend", values = c("cis" = "blue","trans" = "red"))+
theme_bw()
## Section 3 : P-Value testing
Test 1: Testing for differences in the Variances of TFBS differences between cis and trans-only gene groups.
Text: “For all transcription factors except Dl (Figure 4A-E), the genes with cis effects did indeed show a broader distribution of difference than those with trans effects (all TFs: p = 8.8e-13, Rel: p = 2.9e-2, Srp: p = 7.1e-10, CrebA: p =1.5e-7 , F-test to compare distribution variances, Bonferoni corrected).”
tempvar<-as.numeric()
##total tfs
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis',c("diff")],allisall[allisall$eff.y=='trans',c("diff")] )$p.value)
#rel
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis'& allisall$tf.x=='rel' ,c("diff")],allisall[allisall$eff.y=='trans'& allisall$tf.x=='rel',c("diff")])$p.value)
#dl
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis'& allisall$tf.x=='dl' ,c("diff")],allisall[allisall$eff.y=='trans'& allisall$tf.x=='dl',c("diff")] )$p.value)
#srp
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis'& allisall$tf.x=='srp' ,c("diff")],allisall[allisall$eff.y=='trans'& allisall$tf.x=='srp',c("diff")] )$p.value)
#creba
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis'& allisall$tf.x=='crebA' ,c("diff")],allisall[allisall$eff.y=='trans'& allisall$tf.x=='crebA',c("diff")] )$p.value)
##
pvaluesum<-as.data.frame(cbind(temptfs, tempvar))
colnames(pvaluesum)<-c("tfs","variance_test")
pvaluesum$variance_test<-as.numeric(as.character(pvaluesum$variance_test))
pvaluesum$variance_test<-pvaluesum$variance_test*5 #Bonferoni correction (5 tests)
pvaluesum
## tfs variance_test
## 1 all 3.019807e-13
## 2 rel 1.514177e-02
## 3 dl 4.922592e-01
## 4 srp 1.193268e-10
## 5 crebA 4.477930e-08
Its notable this the these results where not found to be significant but are included in the suplement.
###again but this time with scores
#function
stf<-function(temp_data,geno_cond){
tempgenelist<-unique(temp_data$sequence_name)
relish<-as.numeric()
dorsal<-as.numeric()
serpent<-as.numeric()
creba<-as.numeric()
for(i in 1:length(tempgenelist)){
tempgene<-temp_data[temp_data$sequence_name==tempgenelist[i],]
relish<-append(relish, sum(tempgene[tempgene$motif_id=="rel",c("score")]))
dorsal<-append(dorsal, sum(tempgene[tempgene$motif_id=="dl",c("score")]))
serpent<-append(serpent, sum(tempgene[tempgene$motif_id=="srp",c("score")]))
creba<-append(creba, sum(tempgene[tempgene$motif_id=="crebA",c("score")]))
}
tempgenelist<-as.data.frame(cbind(tempgenelist, relish,dorsal,serpent,creba))
colnames(tempgenelist)<-c("Gene", paste(geno_cond,"rel", sep="_"),paste(geno_cond,"dl", sep="_"),paste(geno_cond,"srp", sep="_"),paste(geno_cond,"crebA", sep="_"))
newgenes<-gather(tempgenelist, key = "cond", value = "tfbs", paste(geno_cond,"rel", sep="_"):paste(geno_cond,"crebA", sep="_"))
newgenes$geno<-t(as.data.frame(strsplit(newgenes$cond,split = "_")))[,1]
newgenes$eff<-t(as.data.frame(strsplit(newgenes$cond,split = "_")))[,2]
newgenes$tf<-t(as.data.frame(strsplit(newgenes$cond,split = "_")))[,3]
newgenes$gene_tf<-paste(newgenes$Gene,newgenes$tf,sep="_")
return(newgenes)
}
allcis<-merge(stf(temp_a4cis,"a4_cis"),stf(temp_b6cis,"b6_cis"), by=7, all=T)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning: attributes are not identical across measure variables;
## they will be dropped
alltrans<-merge(stf(temp_a4trans,"a4_trans"),stf(temp_b6trans,"b6_trans"), by=7, all=T)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning: attributes are not identical across measure variables;
## they will be dropped
# a4 binding sites - b6 binding sites
allcis$diff<-as.numeric(allcis$tfbs.x) - as.numeric(allcis$tfbs.y)
alltrans$diff<-as.numeric(alltrans$tfbs.x) - as.numeric(alltrans$tfbs.y)
allisall<-rbind(allcis,alltrans)
allisall<-allisall[complete.cases(allisall),]
a4higher<-expression_direction(allisall)
allisall<-cbind(allisall,a4higher)
tempvar<-as.numeric()
temptfs<-c("all","rel","dl","srp","crebA")
tempcisdiff<-as.numeric()
temptransdiff<-as.numeric()
tempprop<-as.numeric()
Sup_figure_4_A<-ggplot(allisall,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis",])/nrow(allisall[allisall$eff.y=="cis",]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans",])/nrow(allisall[allisall$eff.y=="trans",]))
Sup_figure_4_A+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="Differences in Total TFBS Score",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect" )+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
#geom_point(data=tempmeans, color='black',shape=23)
Sup_figure_4_C<-ggplot(allisall[allisall$tf.x=='rel',] ,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$tf.x =="rel" & allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$tf.x =="rel" & allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis"& allisall$tf.x=='rel',])/nrow(allisall[allisall$eff.y=="cis" & allisall$tf.x=='rel',]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans"& allisall$tf.x=='rel',])/nrow(allisall[allisall$eff.y=="trans" & allisall$tf.x=='rel',]))
Sup_figure_4_C+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="Relish Only\nDifferences in Total TFBS Score",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect")+
xlim(-120,60)+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
# geom_point(data=tempmeans, color='black',shape=23)
Sup_figure_4_B<-ggplot(allisall[allisall$tf.x=='dl',] ,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$tf.x =="dl" & allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$tf.x =="dl" & allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis"& allisall$tf.x=='dl',])/nrow(allisall[allisall$eff.y=="cis" & allisall$tf.x=='dl',]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans"& allisall$tf.x=='dl',])/nrow(allisall[allisall$eff.y=="trans" & allisall$tf.x=='dl',]))
Sup_figure_4_B+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="Dorsal Only\nDifferences in Total TFBS score",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect")+
xlim(-120,60)+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
#geom_point(data=tempmeans, color='black',shape=23)
Sup_figure_4_D<-ggplot(allisall[allisall$tf.x=='srp',] ,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$tf.x =="srp" & allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$tf.x =="srp" & allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis"& allisall$tf.x=='srp',])/nrow(allisall[allisall$eff.y=="cis" & allisall$tf.x=='srp',]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans"& allisall$tf.x=='srp',])/nrow(allisall[allisall$eff.y=="trans" & allisall$tf.x=='srp',]))
Sup_figure_4_D+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="Serpent Only\nDifferences in TFBS score",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect",caption = "Ploted means values are 0 excluding")+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
xlim(-120,60)+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
#geom_point(data=tempmeans, color='black',shape=23)
Sup_figure_4_E<-ggplot(allisall[allisall$tf.x=='crebA',] ,aes(y=`eff.y`, x=`diff`,colour=`eff.y`))
tempmeans<-data.frame(eff.y=c("cis","trans"),diff= c(round(mean(allisall[allisall$tf.x =="crebA" & allisall$eff.y=="cis" & allisall$diff!=0 ,c("diff")],na.rm=TRUE), digits=2),round(mean(allisall[allisall$tf.x =="crebA" & allisall$eff.y=="trans" & allisall$diff!=0 ,c("diff")],na.rm=TRUE),digits = 2)))
tempcisdiff<-append(tempcisdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis"& allisall$tf.x=='crebA',])/nrow(allisall[allisall$eff.y=="cis" & allisall$tf.x=='crebA',]))
temptransdiff<-append(temptransdiff, nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans"& allisall$tf.x=='crebA',])/nrow(allisall[allisall$eff.y=="trans" & allisall$tf.x=='crebA',]))
Sup_figure_4_E+
geom_point()+
geom_jitter()+
theme_bw()+
labs(title="CrebA Only\nDifferences in Total TFBS score",
x="Number of Binding Sites\n(A4-B6)",
y="Gene Effect",caption = "Ploted means values are 0 excluding")+
scale_color_manual(name="Gene Group",values=c("cis"='blue',"trans"='red'),labels = c("Any Cis","Only Trans"))+
xlim(-120,60)+
stat_summary(fun.data="mean_sdl",geom="errorbar", color="black", width=0.2)+
stat_summary(fun="mean", geom="point", color="black")
#geom_point(data=tempmeans, color='black',shape=23)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis",]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans",]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis",]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans",])))))$p.value)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis" & allisall$tf.x=='rel',]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans" & allisall$tf.x=='rel',]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis" & allisall$tf.x=='rel',]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans" & allisall$tf.x=='rel',])))))$p.value)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis" & allisall$tf.x=='dl',]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans" & allisall$tf.x=='dl',]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis" & allisall$tf.x=='dl',]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans" & allisall$tf.x=='dl',])))))$p.value)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis" & allisall$tf.x=='srp',]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans" & allisall$tf.x=='srp',]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis" & allisall$tf.x=='srp',]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans" & allisall$tf.x=='srp',])))))$p.value)
tempprop<-append(tempprop,prop.test(as.matrix(data.frame(succ = c(nrow(allisall[allisall$diff!=0 & allisall$eff.y=="cis" & allisall$tf.x=='crebA',]), nrow(allisall[allisall$diff!=0 & allisall$eff.y=="trans" & allisall$tf.x=='crebA',]) ),fail = c(nrow(allisall[allisall$diff==0 & allisall$eff.y=="cis" & allisall$tf.x=='crebA',]), nrow(allisall[allisall$diff==0 & allisall$eff.y=="trans" & allisall$tf.x=='crebA',])))))$p.value)
tempprop
## [1] 0.1244081 1.0000000 0.8514442 0.0950432 0.2042293
propsum<-as.data.frame(cbind(temptfs,tempcisdiff,temptransdiff))
colnames(propsum)<-c("tfs","cis","trans")
propsum<-gather(propsum, key = "test", value = "proportion_non0",cis:trans)
## Warning: attributes are not identical across measure variables;
## they will be dropped
propsum$proportion_non0<-as.numeric(propsum$proportion_non0)
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis',c("diff")],allisall[allisall$eff.y=='trans',c("diff")] )$p.value)
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis'& allisall$tf.x=='rel' ,c("diff")],allisall[allisall$eff.y=='trans'& allisall$tf.x=='rel',c("diff")])$p.value)
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis'& allisall$tf.x=='dl' ,c("diff")],allisall[allisall$eff.y=='trans'& allisall$tf.x=='dl',c("diff")] )$p.value)
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis'& allisall$tf.x=='srp' ,c("diff")],allisall[allisall$eff.y=='trans'& allisall$tf.x=='srp',c("diff")] )$p.value)
tempvar<-append(tempvar,var.test(allisall[allisall$eff.y=='cis'& allisall$tf.x=='crebA' ,c("diff")],allisall[allisall$eff.y=='trans'& allisall$tf.x=='crebA',c("diff")] )$p.value)
Sup_figure_4_F<-ggplot(propsum,aes(x=`tfs`, y=`proportion_non0`, fill=`test`))
Sup_figure_4_F+
geom_bar(position="dodge", stat="identity")+
labs(title = "Proportion of genes with non-zero\ndifferences in TFBS score",x="Transcription Factor", y="proportion of non-zero differences", caption = "No significance found for using test of proportions")+
scale_fill_manual("legend", values = c("cis" = "blue","trans" = "red"))+
theme_bw()
##pvalues
pvaluesum<-as.data.frame(cbind(temptfs, tempvar))
colnames(pvaluesum)<-c("tfs","variance_test")
pvaluesum$variance_test<-as.numeric(as.character(pvaluesum$variance_test))
pvaluesum$variance_test<-pvaluesum$variance_test*5 #Bonferoni correction (5 tests)
pvaluesum
## tfs variance_test
## 1 all 1.276756e-12
## 2 rel 2.639975e-01
## 3 dl 2.750666e+00
## 4 srp 5.140333e-13
## 5 crebA 2.220446e-14