# Attach a gene-pair label to every row of temp2 by matching its row names
# against the strain column of temp.
temp2$gene_pair <- temp$gene_pair[match(rownames(temp2), temp$strain)]

# record percent of unique gene pairs showing a significant interaction in at
# least one condition
# Logical masks are used instead of `x[-which(cond)]`: when nothing matches,
# `-which(cond)` is integer(0) and the subset would silently become empty.
has_pair <- !is.na(temp2$gene_pair)
length(unique(temp2$gene_pair[has_pair]))
# NOTE(review): the numerator is not filtered for NA labels, so an unlabeled
# significant row would add NA as one extra "pair" - confirm this is intended.
length(unique(temp2$gene_pair[which(temp2$num_pos > 0 | temp2$num_neg > 0)])) /
  length(unique(temp2$gene_pair[has_pair])) * 100

# remove SAP30 containing pairs and calculate again
# `%in%` never yields NA, so rows with a missing query are kept, as before.
temp3 <- temp2[!(temp2$query %in% "SAP30g7"), ]
has_pair3 <- !is.na(temp3$gene_pair)
length(unique(temp3$gene_pair[has_pair3]))
length(unique(temp3$gene_pair[which(temp3$num_pos > 0 | temp3$num_neg > 0)])) /
  length(unique(temp3$gene_pair[has_pair3])) * 100

# Number of GIs detected in YPD24hr also detected in YPD48hr (and fold increase
# in magnitude of GI)
# ypd24_gis = which(abs(temp2$YPD24hr)>(2*gi_sd+gi_mean))
ypd24_gis <- which(temp2$YPD24hr > zthresh_upper | temp2$YPD24hr < zthresh_lower)
# ypd48_gis = which(abs(temp2$YPD48hr)>(2*gi_sd+gi_mean))
ypd48_gis <- which(temp2$YPD48hr > zthresh_upper | temp2$YPD48hr < zthresh_lower)
length(ypd24_gis)
# Row indices significant at both time points.
shared_gis <- ypd24_gis[ypd24_gis %in% ypd48_gis]
length(shared_gis)
# Hard-coded percentage kept from an earlier session; the general form of the
# same ratio is computed from the data two statements below.
216 / 252 * 100
sub <- temp2[shared_gis, ]
mean(abs(sub$YPD48hr) / abs(sub$YPD24hr))
length(shared_gis) / length(ypd24_gis)
length(shared_gis) / length(ypd24_gis) * 100
# Guide names for guide sequences present in both the query plasmids and the
# starting pool: array-side naming first, query-side naming second.
rep_array_names <- c(
  "PRE7-TRg-7", "PRE4-NRg-9", "PRE4-TRg-3", "RPN5-NRg-1", "COG3-TRg-1",
  "SED5-TRg-5", "SEC22-NRg-8", "IMP4-TRg-6", "DIP2-TRg-5", "PWP2-NRg-2",
  "TIF6-NRg-8", "RPF1-TRg-3", "MAK16-TRg-1"
)
rep_query_names <- c(
  "PRE7g7", "PRE4g9", "PRE4g3", "RPN5g1", "COG3g1", "SED5g5", "SEC22g2",
  "IMP4g6", "DIP2g5", "PWP2g2_BC1", "TIF6g8", "RPF1g3", "MAK16g1",
  "PWP2g2_BC2"
)
# combine all GI scores into one table (one column per condition)
temp <- data.frame(
  YPD24hr = ypd24_doubles$gi_score,
  YPD48hr = ypd48_doubles$gi_score,
  YPEG = ypeg_doubles$gi_score,
  YPD37 = ypd37_doubles$gi_score,
  SCURA = ura_doubles$gi_score
)
# Strain, query and array labels are all taken from the YPD24hr table.
# NOTE(review): this assumes the five *_doubles tables share the same row
# order - confirm upstream.
temp$strain <- ypd24_doubles$strain
temp$query <- ypd24_doubles$query
temp$array <- ypd24_doubles$array
# Gene target = guide name with everything from the first "g" (query) or the
# first "-" (array) onwards removed.
temp$query_target <- gsub("g.+", "", temp$query)
temp$array_target <- gsub("-.+", "", temp$array)
temp$gene_pair <- paste(temp$query_target, temp$array_target, sep = "_")

# remove rows where the gene targets are the same
# A logical mask replaces `temp[-which(cond), ]`, which would drop every row
# if no query/array pair shared a target.
same_target <- temp$query_target == temp$array_target
temp <- temp[!(same_target %in% TRUE), ]
# Give both orientations of a replicated pair ("B_A" and "A_B") one label in
# gene_pair2, so that reverse-orientation replicates are grouped together.
temp$gene_pair2 <- temp$gene_pair
rep_gene_targets <- unique(gsub("-.+", "", rep_array_names))

# Every unordered pair of targets, one per column, enumerated as (i, j), i < j.
target_pairs <- combn(seq_along(rep_gene_targets), 2)
for (pair_no in seq_len(ncol(target_pairs))) {
  i <- target_pairs[1, pair_no]
  j <- target_pairs[2, pair_no]
  # label to be replaced (reverse orientation) and the label replacing it
  old_label <- paste(rep_gene_targets[j], rep_gene_targets[i], sep = "_")
  new_label <- paste(rep_gene_targets[i], rep_gene_targets[j], sep = "_")
  temp$gene_pair2[which(temp$gene_pair == old_label)] <- new_label
}

# Number of distinct labels before and after merging orientations.
print(length(unique(temp$gene_pair)))
print(length(unique(temp$gene_pair2)))
# make new data frame to record mean and sd of GI scores across replicate
# strains; require at least 3 replicate strains for this analysis
rep_conditions <- c("ypd24", "ypd48", "ypeg", "ypd37", "ura")
rep_dat <- subset(data.frame(table(temp$gene_pair2)), Freq > 2)
rep_dat <- cbind(rep_dat, data.frame(matrix(nrow = nrow(rep_dat), ncol = 15)))
names(rep_dat) <- c(
  "gene_pair", "nstrains",
  paste("nest", rep_conditions, sep = "."),
  paste("mean", rep_conditions, sep = "."),
  paste("sd", rep_conditions, sep = ".")
)

# loop through each gene pair and record gi score data from across strains and
# conditions; seq_len() keeps the loop from running when rep_dat has no rows
for (i in seq_len(nrow(rep_dat))) {
  # GI-score columns (1:5 of temp) for all strains carrying this gene pair
  sub <- temp[which(temp$gene_pair2 == rep_dat$gene_pair[i]), 1:5]
  # Column-wise summaries taken directly on the data frame, avoiding the
  # matrix coercion that apply() performs.
  rep_dat[i, 3:7] <- vapply(sub, function(x) sum(!is.na(x)), integer(1))
  rep_dat[i, 8:12] <- vapply(sub, mean, numeric(1), na.rm = TRUE)
  rep_dat[i, 13:17] <- vapply(sub, sd, numeric(1), na.rm = TRUE)
}
rownames(rep_dat) <- rep_dat$gene_pair
# re-format for heatmap plotting: stack one (gene_pair, nstrains, num_est,
# mean, sd) slice per condition into a long table
cond_levels <- c("ypd24", "ypd48", "ypeg", "ypd37", "ura")
rep_dat_l <- NULL
for (k in seq_along(cond_levels)) {
  # In rep_dat, condition k sits in columns 2 + k (count), 7 + k (mean) and
  # 12 + k (sd).
  temp4 <- rep_dat[, c(1:2, 2 + k, 7 + k, 12 + k)]
  names(temp4)[3:5] <- c("num_est", "mean", "sd")
  temp4$condition <- cond_levels[k]
  rep_dat_l <- if (is.null(rep_dat_l)) temp4 else rbind(rep_dat_l, temp4)
}

# set factor levels for conditions
rep_dat_l$condition <- factor(rep_dat_l$condition, levels = cond_levels)

# set mean and sd to NA if there are fewer than 3 replicate measurements
too_few <- which(rep_dat_l$num_est < 3)
rep_dat_l$mean[too_few] <- NA
rep_dat_l$sd[too_few] <- NA
head(rep_dat_l)

# Normal-approximation 95% interval for each mean: mean -/+ 1.96 * sd / sqrt(n).
# Computed for all rows at once instead of an index loop over 1:nrow.
ci_half_width <- 1.96 * (rep_dat_l$sd / sqrt(rep_dat_l$num_est))
rep_dat_l$lower.95 <- rep_dat_l$mean - ci_half_width
rep_dat_l$upper.95 <- rep_dat_l$mean + ci_half_width
head(rep_dat_l)
# Spot checks of the interval arithmetic using hard-coded numbers.
-0.06236267 + (1.96 * (0.02971837 / sqrt(3)))
-0.06236267 - (1.96 * (0.02971837 / sqrt(3)))
sqrt(3)
tail(rep_dat_l)
0.02184941 + (1.96 * (0.03048009 / sqrt(3)))
0.02184941 - (1.96 * (0.03048009 / sqrt(3)))

# A row is significant when its whole 95% interval lies outside the band
# [gi_mean - gi_sd, gi_mean + gi_sd].
# The columns are addressed by their full names (upper.95 / lower.95); the
# shorter `$upper` / `$lower` only worked through partial matching.
which(rep_dat_l$upper.95 < (-gi_sd + gi_mean) | rep_dat_l$lower.95 > (gi_sd + gi_mean))
head(rep_dat_l$upper.95 < (-gi_sd + gi_mean) | rep_dat_l$lower.95 > (gi_sd + gi_mean))
# Hard-coded row numbers inspected by hand.
rep_dat_l[c(82, 715, 717), ]
-gi_sd + gi_mean
gi_sd + gi_mean
rep_dat_l[c(6480, 6482, 6522), ]

rep_dat_l$sig <- (rep_dat_l$upper.95 < (-gi_sd + gi_mean) |
  rep_dat_l$lower.95 > (gi_sd + gi_mean))
# sig2 is the text label drawn on the heatmap tiles: "*" for significant rows.
rep_dat_l$sig2 <- ""
rep_dat_l$sig2[which(rep_dat_l$sig == TRUE)] <- "*"

# generate list of unique gene pairs with a significant interaction in at
# least one condition
sig_pairs <- unique(subset(rep_dat_l, sig == TRUE)$gene_pair)
print(length(sig_pairs))

# Make first heatmap for supplement including all gene pairs passing threshold
# first get order of guide pairs (clustered based on euclidean distance), also
# plot dendrogram
coldist <- dist(subset(rep_dat, gene_pair %in% sig_pairs)[, 8:12])
colclust <- hclust(coldist, method = "average")
colclust.d <- as.dendrogram(colclust)
colorder <- colclust$labels[colclust$order]
```
```{r fig.width=14,fig.height=6}
# Draw the gene-pair dendrogram with a bottom margin of 10 lines, then set the
# margins to c(5, 4, 4, 2).
par(mar = c(10, 4, 4, 2))
plot(colclust.d, edgePar = list(lwd = 3))
par(mar = c(5, 4, 4, 2))
```
```{r fig.width=8,fig.height=16}
# make heatmap: one tile per (condition, gene pair) coloured by mean GI score,
# rows ordered by the clustering result, "*" printed on significant tiles
rep_dat_plot <- subset(rep_dat_l, gene_pair %in% sig_pairs)
rep_dat_plot$gene_pair <- factor(rep_dat_plot$gene_pair, levels = colorder)
print(
  qplot(x = condition, y = gene_pair, data = rep_dat_plot,
        fill = mean, geom = "tile") +
    scale_fill_gradientn(
      colours = c("steelblue", "black", "goldenrod"),
      values = rescale(c(-0.33, zthresh_lower, 0, zthresh_upper, 0.82)),
      guide = "colorbar", limits = c(-0.33, 0.82)
    ) +
    geom_text(aes(label = sig2), color = "white") +
    theme(axis.text.x = element_text(angle = 120, hjust = 1),
          axis.text.y = element_text(angle = 180, hjust = 0)) +
    xlab("") + ylab("")
)

# Count significant pairs under the z-score thresholds. The interval columns
# are addressed by their full names (upper.95 / lower.95) rather than relying
# on partial matching of `$upper` / `$lower`.
rep_dat_l$sig <- (rep_dat_l$upper.95 < zthresh_lower | rep_dat_l$lower.95 > zthresh_upper)
rep_dat_l$sig2 <- ""
rep_dat_l$sig2[which(rep_dat_l$sig == TRUE)] <- "*"
sig_pairs <- unique(subset(rep_dat_l, sig == TRUE)$gene_pair)
print(length(sig_pairs))

# Recompute with the gi_mean -/+ gi_sd band; this overwrites the result above
# and is what the following clustering and heatmap use.
rep_dat_l$sig <- (rep_dat_l$upper.95 < (-gi_sd + gi_mean) |
  rep_dat_l$lower.95 > (gi_sd + gi_mean))
rep_dat_l$sig2 <- ""
rep_dat_l$sig2[which(rep_dat_l$sig == TRUE)] <- "*"
sig_pairs <- unique(subset(rep_dat_l, sig == TRUE)$gene_pair)
print(length(sig_pairs))

# Make first heatmap for supplement including all gene pairs passing threshold
# first get order of guide pairs (clustered based on euclidean distance), also
# plot dendrogram
coldist <- dist(subset(rep_dat, gene_pair %in% sig_pairs)[, 8:12])
colclust <- hclust(coldist, method = "average")
colclust.d <- as.dendrogram(colclust)
colorder <- colclust$labels[colclust$order]
```
```{r fig.width=14,fig.height=6}
# Dendrogram of the clustered gene pairs; the bottom margin is widened to 10
# lines for the plot and the margins are then set to c(5, 4, 4, 2).
par(mar = c(10, 4, 4, 2))
plot(colclust.d, edgePar = list(lwd = 3))
par(mar = c(5, 4, 4, 2))
```
```{r fig.width=8,fig.height=16}
# make heatmap: one tile per (condition, gene pair) coloured by mean GI score,
# rows ordered by the clustering result, "*" printed on significant tiles
rep_dat_plot <- subset(rep_dat_l, gene_pair %in% sig_pairs)
rep_dat_plot$gene_pair <- factor(rep_dat_plot$gene_pair, levels = colorder)
print(
  qplot(x = condition, y = gene_pair, data = rep_dat_plot,
        fill = mean, geom = "tile") +
    scale_fill_gradientn(
      colours = c("steelblue", "black", "goldenrod"),
      values = rescale(c(-0.33, zthresh_lower, 0, zthresh_upper, 0.82)),
      guide = "colorbar", limits = c(-0.33, 0.82)
    ) +
    geom_text(aes(label = sig2), color = "white") +
    theme(axis.text.x = element_text(angle = 120, hjust = 1),
          axis.text.y = element_text(angle = 180, hjust = 0)) +
    xlab("") + ylab("")
)

# Make second heatmap for figure 2 excluding gene pairs with positive
# interaction including SAP30 and gene pairs that did not have an estimate in
# all 5 conditions
# names of those with missing values
rem_pairs1 <- rep_dat_l$gene_pair[which(is.na(rep_dat_l$mean))]
# names of those with pos SAP30 (z score >1)
rem_pairs2 <- temp$gene_pair2[which(temp$query_target == "SAP30" &
  temp$YPEG > (gi_sd + gi_mean))]
# A logical mask replaces `sig_pairs[-which(...)]`, which would return an
# empty set whenever no pair needed to be removed.
sig_pairs2 <- sig_pairs[!(sig_pairs %in% rem_pairs1 | sig_pairs %in% rem_pairs2)]
print(length(sig_pairs2))

# Cluster the remaining pairs on their five per-condition means.
coldist <- dist(subset(rep_dat, gene_pair %in% sig_pairs2)[, 8:12])
colclust <- hclust(coldist, method = "average")
colclust.d <- as.dendrogram(colclust)
colorder <- colclust$labels[colclust$order]
```
```{r fig.width=10,fig.height=6}
# Dendrogram for the reduced gene-pair set; uses a 10-line bottom margin and
# afterwards sets the margins to c(5, 4, 4, 2).
par(mar = c(10, 4, 4, 2))
plot(colclust.d, edgePar = list(lwd = 3))
par(mar = c(5, 4, 4, 2))
```
```{r fig.width=8,fig.height=11}
# make heatmap for the reduced pair set (sig_pairs2): tiles coloured by mean GI
# score, rows in clustering order, "*" drawn where sig2 is set
rep_dat_plot <- subset(rep_dat_l, gene_pair %in% sig_pairs2)
rep_dat_plot$gene_pair <- factor(rep_dat_plot$gene_pair, levels = colorder)
print(
  qplot(x = condition, y = gene_pair, data = rep_dat_plot,
        fill = mean, geom = "tile") +
    scale_fill_gradientn(
      colours = c("steelblue", "black", "goldenrod"),
      values = rescale(c(-0.33, zthresh_lower, 0, zthresh_upper, 0.82)),
      guide = "colorbar",
      limits = c(-0.33, 0.82)
    ) +
    geom_text(aes(label = sig2), color = "white") +
    theme(
      axis.text.x = element_text(angle = 120, hjust = 1),
      axis.text.y = element_text(angle = 180, hjust = 0)
    ) +
    xlab("") +
    ylab("")
)
require(ggplot2)
require(reshape2)
require(RColorBrewer)
require(mgcv)
# Global plot theme: start from theme_gray() and override panel, strip and
# axis elements (white panel, black border, pale grid lines, bold black text).
# NOTE(review): theme_set() and theme_update() are documented to return the
# previously active theme, so theme_dbc may not hold the customised theme -
# confirm before using theme_dbc as a theme object.
theme_dbc <- theme_set(theme_gray())
theme_dbc <- theme_update(
  panel.background = element_rect(fill = "white"),
  panel.border = element_rect(colour = "black", fill = NA, size = 2),
  panel.grid.major = element_line(colour = "gray93", size = 1),
  panel.grid.minor = element_line(colour = "gray98", size = 1),
  strip.text.x = element_text(size = 12, face = "bold"),
  axis.title = element_text(size = 16),
  strip.background = element_rect(colour = "black", fill = "white", size = 1),
  axis.text = element_text(colour = "black", face = "bold", size = 16),
  axis.ticks = element_line(color = "black", size = 2)
)
# Function to read in data
#
# Reads one comma-separated, header-less file per row of `samples` and collects
# the 2nd (FSC) and 4th (BluFL1) columns of each file.
#
# path_name: prefix pasted in front of each file name.
# samples:   table whose column 1 holds file names and column 2 sample names.
# dat_names: table whose column 2 holds the column names for the data files.
#
# Returns a data frame with 10000 rows (more if a file has more events) and
# two columns per sample: all FSC columns first, then all BluFL1 columns, each
# named by sample. Shorter samples are padded with NA.
load_data <- function(path_name, samples, dat_names) {
  no_samples <- nrow(samples)
  fsc_dat <- data.frame(matrix(nrow = 10000, ncol = no_samples))
  names(fsc_dat) <- samples[, 2]
  blu1_dat <- data.frame(matrix(nrow = 10000, ncol = no_samples))
  names(blu1_dat) <- samples[, 2]

  for (i in seq_len(no_samples)) {
    file_name <- paste0(path_name, samples[i, 1])
    sample_name <- samples[[i, 2]]
    print(sample_name)
    temp <- read.table(file_name, sep = ",", header = FALSE)
    names(temp) <- dat_names[, 2]

    # remove events at the max FSC value (261621), with FSC below 2000, or
    # with a third-column value below 1000
    rem <- unique(c(
      which(temp[, 2] == 261621),
      which(temp[, 2] < 2000),
      which(temp[, 3] < 1000)
    ))
    print(length(rem))
    # The original tested `length(rem >= 1)`, which only worked by accident.
    if (length(rem) >= 1) {
      temp <- temp[-rem, ]
    }

    # add data to matrix; skipped when every event was filtered out, because
    # `1:0` would otherwise address rows 1 and 0 and raise an error
    size <- nrow(temp)
    if (size > 0) {
      fsc_dat[seq_len(size), i] <- temp[, 2]
      blu1_dat[seq_len(size), i] <- temp[, 4]
    }
  }
  cbind(fsc_dat, blu1_dat)
}
# Containers for the selected samples of every experiment: 61 sample columns,
# 11500 rows each, one table for FSC and one for BluFL1.
all_fsc <- data.frame(matrix(nrow = 11500, ncol = 61))
all_blu1 <- data.frame(matrix(nrow = 11500, ncol = 61))

# load data from the Data-12-7-16 folder (the original note here read 12/7/17)
samples_exp12_7 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-12-7-16/name_key.txt",
  sep = "\t", header = FALSE, stringsAsFactors = FALSE
)
dat_names_exp12_7 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-12-7-16/files/file=1-2003.info",
  sep = "\t", header = FALSE
)
path_name_exp12_7 <- "~/Desktop/SherlockLab2/gfp/Data-12-7-16/files/file="
temp <- load_data(path_name_exp12_7, samples_exp12_7, dat_names_exp12_7)

# Columns to keep: FSC columns and the matching BluFL1 columns (39 positions
# further right). The three printouts check that the sample names line up.
fsc_cols <- c(3, 10:18, 38)
blu1_cols <- c(42, 49:57, 77)
names(temp)[fsc_cols]
names(temp)[blu1_cols]
names(temp)[fsc_cols] == names(temp)[blu1_cols]
fsc_temp <- temp[, fsc_cols]
blu1_temp <- temp[, blu1_cols]

# Copy into the first no_samples columns of the combined tables.
no_samples <- ncol(fsc_temp)
first_cols <- seq_len(no_samples)
all_fsc[seq_len(nrow(fsc_temp)), first_cols] <- fsc_temp
names(all_fsc)[first_cols] <- names(fsc_temp)
all_blu1[seq_len(nrow(blu1_temp)), first_cols] <- blu1_temp
names(all_blu1)[first_cols] <- names(blu1_temp)
# col_count = next free column in all_fsc / all_blu1
col_count <- 1 + no_samples
# load data from 1/11/17
samples_exp1_11 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-1-11-17/name_key.txt",
  sep = "\t", header = FALSE, stringsAsFactors = FALSE
)
dat_names_exp1_11 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-1-11-17/files/file=2-2004.info",
  sep = "\t", header = FALSE
)
path_name_exp1_11 <- "~/Desktop/SherlockLab2/gfp/Data-1-11-17/files/file="
temp <- load_data(path_name_exp1_11, samples_exp1_11, dat_names_exp1_11)

# Selected FSC columns and their BluFL1 counterparts (45 positions further
# right); the printouts check that the sample names agree.
fsc_cols <- c(5, 8, 11, 15, 16, 25, 38, 39, 42:45)
blu1_cols <- c(50, 53, 56, 60, 61, 70, 83, 84, 87:90)
names(temp)[fsc_cols]
names(temp)[blu1_cols]
names(temp)[fsc_cols] == names(temp)[blu1_cols]
fsc_temp <- temp[, fsc_cols]
blu1_temp <- temp[, blu1_cols]

# Append after the columns already filled, then advance col_count.
no_samples <- ncol(fsc_temp)
col_range <- col_count:(col_count - 1 + no_samples)
all_fsc[seq_len(nrow(fsc_temp)), col_range] <- fsc_temp
names(all_fsc)[col_range] <- names(fsc_temp)
all_blu1[seq_len(nrow(blu1_temp)), col_range] <- blu1_temp
names(all_blu1)[col_range] <- names(blu1_temp)
col_count <- col_count + no_samples
# load data from 1/25/17
samples_exp1_25 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-1-25-17/name_key.txt",
  sep = "\t", header = FALSE, stringsAsFactors = FALSE
)
dat_names_exp1_25 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-1-25-17/files/file=1-2003.info",
  sep = "\t", header = FALSE
)
path_name_exp1_25 <- "~/Desktop/SherlockLab2/gfp/Data-1-25-17/files/file="
temp <- load_data(path_name_exp1_25, samples_exp1_25, dat_names_exp1_25)

# Selected FSC columns and their BluFL1 counterparts (32 positions further
# right); the printouts check that the sample names agree.
fsc_cols <- c(5:7, 9:14, 21:32)
blu1_cols <- c(37:39, 41:46, 53:64)
names(temp)[fsc_cols]
names(temp)[blu1_cols]
names(temp)[fsc_cols] == names(temp)[blu1_cols]
fsc_temp <- temp[, fsc_cols]
blu1_temp <- temp[, blu1_cols]

# Append after the columns already filled, then advance col_count.
no_samples <- ncol(fsc_temp)
col_range <- col_count:(col_count - 1 + no_samples)
all_fsc[seq_len(nrow(fsc_temp)), col_range] <- fsc_temp
names(all_fsc)[col_range] <- names(fsc_temp)
all_blu1[seq_len(nrow(blu1_temp)), col_range] <- blu1_temp
names(all_blu1)[col_range] <- names(blu1_temp)
col_count <- col_count + no_samples
# load data from 2/2/17
samples_exp2_2 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-2-2-17/name_key.txt",
  sep = "\t", header = FALSE, stringsAsFactors = FALSE
)
dat_names_exp2_2 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-2-2-17/files/file=1-2003.info",
  sep = "\t", header = FALSE
)
path_name_exp2_2 <- "~/Desktop/SherlockLab2/gfp/Data-2-2-17/files/file="
temp <- load_data(path_name_exp2_2, samples_exp2_2, dat_names_exp2_2)

# Selected FSC columns and their BluFL1 counterparts (29 positions further
# right); the printouts check that the sample names agree.
fsc_cols <- c(5, 8, 10:11, 18:19, 26:29)
blu1_cols <- c(34, 37, 39:40, 47:48, 55:58)
names(temp)[fsc_cols]
names(temp)[blu1_cols]
names(temp)[fsc_cols] == names(temp)[blu1_cols]
fsc_temp <- temp[, fsc_cols]
blu1_temp <- temp[, blu1_cols]

# Append after the columns already filled, then advance col_count.
no_samples <- ncol(fsc_temp)
col_range <- col_count:(col_count - 1 + no_samples)
all_fsc[seq_len(nrow(fsc_temp)), col_range] <- fsc_temp
names(all_fsc)[col_range] <- names(fsc_temp)
all_blu1[seq_len(nrow(blu1_temp)), col_range] <- blu1_temp
names(all_blu1)[col_range] <- names(blu1_temp)
col_count <- col_count + no_samples
# load data from 2/14/17
samples_exp2_14 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-2-14-17/name_key.txt",
  sep = "\t", header = FALSE, stringsAsFactors = FALSE
)
dat_names_exp2_14 <- read.table(
  "~/Desktop/SherlockLab2/gfp/Data-2-14-17/files/file=1-2004.info",
  sep = "\t", header = FALSE
)
path_name_exp2_14 <- "~/Desktop/SherlockLab2/gfp/Data-2-14-17/files/file="
temp <- load_data(path_name_exp2_14, samples_exp2_14, dat_names_exp2_14)

# Selected FSC columns and their BluFL1 counterparts (38 positions further
# right); the printouts check that the sample names agree.
fsc_cols <- c(8, 17:22)
blu1_cols <- c(46, 55:60)
names(temp)[fsc_cols]
names(temp)[blu1_cols]
names(temp)[fsc_cols] == names(temp)[blu1_cols]
fsc_temp <- temp[, fsc_cols]
blu1_temp <- temp[, blu1_cols]

# Append after the columns already filled, then advance col_count.
no_samples <- ncol(fsc_temp)
col_range <- col_count:(col_count - 1 + no_samples)
all_fsc[seq_len(nrow(fsc_temp)), col_range] <- fsc_temp
names(all_fsc)[col_range] <- names(fsc_temp)
all_blu1[seq_len(nrow(blu1_temp)), col_range] <- blu1_temp
names(all_blu1)[col_range] <- names(blu1_temp)
col_count <- col_count + no_samples
# Builder for the FSC (x) vs BluFL1 (y) scatter plots below. Columns are
# positions in all_fsc / all_blu1.
#   title:         plot title
#   base_col:      sample drawn first, in the default colour (alpha 0.6)
#   turquoise_col: optional sample drawn second in turquoise (alpha 0.2)
#   red_col:       optional sample drawn last in indianred2 (alpha 0.2)
# Both axes are log10 with fixed limits so that all plots are comparable.
plot_fsc_blu1 <- function(title, base_col, turquoise_col = NULL, red_col = NULL) {
  p <- ggplot() +
    geom_point(aes(y = all_blu1[, base_col], x = all_fsc[, base_col]), alpha = 0.6)
  if (!is.null(turquoise_col)) {
    p <- p + geom_point(aes(y = all_blu1[, turquoise_col], x = all_fsc[, turquoise_col]),
                        alpha = 0.2, color = "turquoise")
  }
  if (!is.null(red_col)) {
    p <- p + geom_point(aes(y = all_blu1[, red_col], x = all_fsc[, red_col]),
                        alpha = 0.2, color = "indianred2")
  }
  p +
    scale_x_log10(limits = c(30000, 3e5)) +
    scale_y_log10(limits = c(20, 10000)) +
    xlab("FSC") + ylab("BluFL1") +
    annotation_logticks(size = 2) +
    ggtitle(title)
}

# One example for RPN5 (Have 4 replicates)
names(all_blu1)[14] # MIA187 strain is the GFP RPN5 strain carrying the control guide
names(all_blu1)[21:22]
names(all_fsc)[14]
names(all_fsc)[21:22]
plot_fsc_blu1("RPN5g1", 14, 22, 21)
plot_fsc_blu1("RPN5g1", 14)

# One example for GET2g2 (have 3 total)
names(all_blu1)[c(11, 5, 2)]
names(all_fsc)[c(11, 5, 2)]
plot_fsc_blu1("GET2g2", 11, 5, 2)
plot_fsc_blu1("GET2g2", 11)

# One example for COG8g2 (have 4 total)
names(all_blu1)[c(12, 16, 15)] # MIA165 is COG8 gfp strain with non targeting control guide
names(all_fsc)[c(12, 16, 15)]
plot_fsc_blu1("COG8g2", 12, 16, 15)
plot_fsc_blu1("COG8g2", 12)

# One example for SAP30g7 (have 4 total)
names(all_blu1)[c(13, 19, 18)] # MIA183 is SAP30 gfp strain with non targeting control guide
names(all_fsc)[c(13, 19, 18)]
plot_fsc_blu1("SAP30g7", 13, 19, 18)
plot_fsc_blu1("SAP30g7", 13)

# One example for YCR016W (have 1 total)
names(all_blu1)[c(46, 52, 51)]
names(all_fsc)[c(46, 52, 51)]
plot_fsc_blu1("YCR016Wg4", 46, 52, 51)
plot_fsc_blu1("YCR016Wg4", 46)

# one example for YLR050C (have 1 total)
names(all_blu1)[c(45, 48, 47)]
names(all_fsc)[c(45, 48, 47)]
plot_fsc_blu1("YLR050Cg1", 45, 48, 47)
plot_fsc_blu1("YLR050Cg1", 45)

# One example for PRE4g3 (have 1 total)
names(all_blu1)[c(55, 57, 56)]
names(all_fsc)[c(55, 57, 56)]
plot_fsc_blu1("PRE4g3", 55, 57, 56)
plot_fsc_blu1("PRE4g3", 55)

# For PRE4 also show non-targeting plus and minus atc
names(all_blu1)[c(55, 61, 60)]
names(all_fsc)[c(55, 61, 60)]
plot_fsc_blu1("PRE4 non-targeting", 55, 61, 60)

# One example for PRE4g9 (have 1 total)
names(all_blu1)[c(55, 59, 58)]
names(all_fsc)[c(55, 59, 58)]
plot_fsc_blu1("PRE4g9", 55, 59, 58)
# Fit BluFL1 as a smooth function of FSC on one sample and return the
# residuals of that sample and of a second sample against the fitted curve.
#
# training_i: column of all_fsc / all_blu1 used to fit the model.
# test_i:     column of all_fsc / all_blu1 scored against the fitted model.
# guide:      label used to build the two output column names,
#             "<guide>_resid_minus" (training) and "<guide>_resid_plus" (test).
#
# Returns a data frame with two columns padded with NA to at least 11820 rows;
# the fixed height lets the results for several guides be cbind()-ed.
# Reads the global tables all_fsc and all_blu1 and uses gam() (loaded elsewhere
# in this file via mgcv).
return_resids <- function(training_i, test_i, guide) {
  training <- data.frame(FSC = all_fsc[, training_i], BluFL1 = all_blu1[, training_i])

  # subtract missing values and small cell sizes
  # Logical masks replace `x[-which(cond), ]`: with no NA (or no small cell)
  # `-which(cond)` is integer(0) and every training row would be dropped.
  training <- training[!is.na(training$BluFL1), ]
  # Small cells (FSC < 40000), where data is sparse, are removed; rows with a
  # missing FSC are kept, exactly as `which()` kept them.
  small_cell <- !is.na(training$FSC) & training$FSC < 40000
  training <- training[!small_cell, ]
  if (nrow(training) == 0) {
    stop("no usable training events for column ", training_i, call. = FALSE)
  }

  # save model and use to predict fluorescence
  mdl <- gam(BluFL1 ~ s(FSC, bs = "cs"), data = training)
  print(summary(mdl))
  pBluFL1 <- predict(mdl, data.frame(FSC = training$FSC))

  # calculate residuals
  train_resid <- training$BluFL1 - pBluFL1
  print(head(train_resid))

  # get data for treated sample
  temp <- data.frame(FSC = all_fsc[, test_i], BluFL1 = all_blu1[, test_i])
  # remove na values or small cell sizes
  temp <- temp[!is.na(temp$FSC), ]
  temp <- temp[!(temp$FSC < 40000), ]

  # calculate residuals
  pBluFL1_test <- predict(mdl, data.frame(FSC = temp$FSC))
  test_resid <- temp$BluFL1 - pBluFL1_test
  print(head(test_resid))

  # return training and test residuals
  resids <- data.frame(matrix(ncol = 2, nrow = 11820))
  names(resids) <- c(
    paste(guide, "resid_minus", sep = "_"),
    paste(guide, "resid_plus", sep = "_")
  )
  # seq_along() avoids the `1:0` trap when a residual vector is empty.
  if (length(train_resid) > 0) {
    resids[seq_along(train_resid), 1] <- train_resid
  }
  if (length(test_resid) > 0) {
    resids[seq_along(test_resid), 2] <- test_resid
  }
  resids
}
# Residuals for every targeting guide: (training column, test column, label)
# triples are passed to return_resids() in order, and the resulting two-column
# tables are bound side by side.
resid_training_cols <- c(16, 5, 57, 59, 22, 19, 52, 48)
resid_test_cols <- c(15, 2, 56, 58, 21, 18, 51, 47)
resid_guides <- c("COG8g2", "GET2g2", "PRE4g3", "PRE4g9",
                  "RPN5g1", "SAP30g7", "YCR016Wg4", "YLR050Cg1")
# unname() keeps cbind() from prefixing the column names with list names.
all_resids <- do.call(
  cbind,
  unname(Map(return_resids, resid_training_cols, resid_test_cols, resid_guides))
)
View(all_resids)
# Welch t-test of minus vs plus residuals for COG8g2.
t.test(all_resids$COG8g2_resid_minus,all_resids$COG8g2_resid_plus)
# Median residual per column of all_resids, with the column name in `sample`.
resid_medians <- vapply(all_resids, median, numeric(1), na.rm = TRUE)
med_resid <- data.frame(median = resid_medians, sample = names(resid_medians),
                        stringsAsFactors = FALSE)

# Long format for plotting; xval is the 1-based position of each residual
# column, so the minus/plus pair of one guide sits at neighbouring x values.
temp <- melt(all_resids)
samples <- unique(temp$variable)
temp$xval <- match(temp$variable, samples)
med_resid$xval <- match(med_resid$sample, samples)

# Violin plot of residuals per sample. Grey rectangles shade every other guide
# and a dash (shape 95) marks each median.
# geom_hline() is given an explicit yintercept: current ggplot2 requires it,
# and 0 is the zero-residual reference line.
ggplot(temp, aes(x = xval, y = value)) +
  geom_hline(yintercept = 0) +
  ylim(-2000, 1000) +
  annotate("rect", xmin = 0.5, xmax = 2.5, ymin = -2000, ymax = 1000, alpha = 0.3) +
  annotate("rect", xmin = 4.5, xmax = 6.5, ymin = -2000, ymax = 1000, alpha = 0.3) +
  annotate("rect", xmin = 8.5, xmax = 10.5, ymin = -2000, ymax = 1000, alpha = 0.3) +
  annotate("rect", xmin = 12.5, xmax = 14.5, ymin = -2000, ymax = 1000, alpha = 0.3) +
  # fill by the text after the last "_" of the column name ("minus" / "plus")
  geom_violin(aes(fill = gsub(".+_", "", variable), group = variable)) +
  scale_x_continuous(
    breaks = c(1.5, 3.5, 5.5, 7.5, 9.5, 11.5, 13.5, 15.5),
    labels = c("COG8g2", "GET2g2", "PRE4g3", "PRE4g9",
               "RPN5g1", "SAP30g7", "YCR016Wg4", "YLR050Cg1")
  ) +
  theme(axis.text.x = element_text(angle = 70, hjust = 1)) +
  ylab("") + xlab("") +
  geom_point(data = med_resid, aes(x = xval, y = median), shape = 95, size = 20) +
  scale_fill_manual(values = c("#00BFC4", "#F8766D"))
# Residuals for the non-targeting ("nt") control guides.
# NOTE(review): GET2nt uses columns (1, 8) while every other call pairs a
# column with the one just before it - confirm these indices.
cont_resids <- cbind(
  return_resids(1, 8, "GET2nt"),
  return_resids(61, 60, "PRE4nt"),
  return_resids(54, 53, "YCR016Wnt"),
  return_resids(50, 49, "YLR050Cnt")
)

# Median residual per column of cont_resids, with the column name in `sample`.
resid_medians <- vapply(cont_resids, median, numeric(1), na.rm = TRUE)
med_resid <- data.frame(median = resid_medians, sample = names(resid_medians),
                        stringsAsFactors = FALSE)

# Long format for plotting; xval is the 1-based position of each residual
# column, so the minus/plus pair of one guide sits at neighbouring x values.
temp <- melt(cont_resids)
samples <- unique(temp$variable)
temp$xval <- match(temp$variable, samples)
med_resid$xval <- match(med_resid$sample, samples)

# Violin plot of control residuals. Grey rectangles shade every other guide
# and a dash (shape 95) marks each median.
# geom_hline() is given an explicit yintercept: current ggplot2 requires it,
# and 0 is the zero-residual reference line.
ggplot(temp, aes(x = xval, y = value)) +
  geom_hline(yintercept = 0) +
  ylim(-2000, 1000) +
  annotate("rect", xmin = 0.5, xmax = 2.5, ymin = -2000, ymax = 1000, alpha = 0.3) +
  annotate("rect", xmin = 4.5, xmax = 6.5, ymin = -2000, ymax = 1000, alpha = 0.3) +
  # fill by the text after the last "_" of the column name ("minus" / "plus")
  geom_violin(aes(fill = gsub(".+_", "", variable), group = variable)) +
  scale_x_continuous(
    breaks = c(1.5, 3.5, 5.5, 7.5),
    labels = c("GET2nt", "PRE4nt", "YCR016Wnt", "YLR050Cnt")
  ) +
  theme(axis.text.x = element_text(angle = 70, hjust = 1)) +
  ylab("") + xlab("") +
  geom_point(data = med_resid, aes(x = xval, y = median), shape = 95, size = 20) +
  scale_fill_manual(values = c("#00BFC4", "#F8766D"))
# Two-sample t-tests comparing the "minus" and "plus" residual columns: first
# for the GET2g2 targeting guide, then for each non-targeting control guide.
t.test(all_resids$GET2g2_resid_minus,all_resids$GET2g2_resid_plus)
t.test(cont_resids$GET2nt_resid_minus,cont_resids$GET2nt_resid_plus)
# NOTE(review): the next call compares GET2nt_resid_minus with itself, so it
# carries no information; it looks like a copy-paste slip - confirm.
t.test(cont_resids$GET2nt_resid_minus,cont_resids$GET2nt_resid_minus)
t.test(cont_resids$PRE4nt_resid_minus,cont_resids$PRE4nt_resid_plus)
t.test(cont_resids$YCR016Wnt_resid_minus,cont_resids$YCR016Wnt_resid_plus)
t.test(cont_resids$YLR050Cnt_resid_minus,cont_resids$YLR050Cnt_resid_plus)
