require(ggplot2)
## Loading required package: ggplot2
require(reshape2)
## Loading required package: reshape2
# Global plot theme used by every figure below: start from ggplot2's gray
# theme, then override panel, strip and axis elements.
theme_set(theme_gray())
# theme_dbc keeps whatever theme_update() returns (per the ggplot2 docs this is
# the theme that was active before the update, not the customised one).
theme_dbc <- theme_update(
  # Panel: white background, heavy black frame, very light grid lines.
  panel.background = element_rect(fill = "white"),
  panel.border = element_rect(colour = "black", fill = NA, size = 2),
  panel.grid.major = element_line(colour = "gray93", size = 1),
  panel.grid.minor = element_line(colour = "gray98", size = 1),
  # Facet strips: bold labels on a white, black-outlined box.
  strip.text.x = element_text(size = 12, face = "bold"),
  strip.background = element_rect(colour = "black", fill = "white", size = 1),
  # Axes: large titles, bold black tick labels, thick black ticks.
  axis.title = element_text(size = 16),
  axis.text = element_text(colour = "black", face = "bold", size = 16),
  axis.ticks = element_line(color = "black", size = 2)
)
# Load the two barcode count tables (tab-separated, header row, first column
# used as row names) and add them together into dat5.
dat <- read.table(
  file = "~/Desktop/SherlockLab2/System_test/screen2_hiseq/seqlib11_analysis/bc_counts/all_counts.txt",
  sep = "\t", header = TRUE, row.names = 1, stringsAsFactors = FALSE
)
dat2 <- read.table(
  file = "~/Desktop/SherlockLab2/System_test/screen2_hiseq/seqlib11_analysis/bc_counts/recovered_counts.txt",
  sep = "\t", header = TRUE, row.names = 1, stringsAsFactors = FALSE
)
# `+` on two data frames combines cells by position, not by row/column name,
# so refuse to continue if the two tables are not laid out identically.
stopifnot(
  identical(dim(dat), dim(dat2)),
  identical(rownames(dat), rownames(dat2)),
  identical(names(dat), names(dat2))
)
dat5 <- dat + dat2
rm(dat, dat2)
# Row key: one row per row of dat5. The columns used below are present,
# category, query and array.
key <- read.table(
  file = "~/Desktop/SherlockLab2/System_test/screen2_hiseq/data_table_row_key.txt",
  sep = "\t", header = TRUE, stringsAsFactors = FALSE
)
# key has no row names of its own; every later step indexes dat5 with
# key-derived row positions, so the two must have the same number of rows.
stopifnot(nrow(key) == nrow(dat5))
# Number of leading dat5 columns (libraries) used in the chimera correction.
no_libs <- 14
Look at counts for present vs. absent barcodes
# For every column of dat5, overlay the count histogram of barcodes flagged
# present == "yes" in key (green) on that of barcodes flagged "no" (blue).
# Iterating over seq_len(ncol(dat5)) instead of a hard-coded 1:25 keeps the
# loop correct if the number of columns in the count table changes.
for (i in seq_len(ncol(dat5))) {
  p <- ggplot() +
    geom_histogram(aes(x = dat5[which(key$present == "yes"), i]),
                   alpha = 0.5, binwidth = 5, fill = "green") +
    geom_histogram(aes(x = dat5[which(key$present == "no"), i]),
                   alpha = 0.5, binwidth = 5, fill = "blue") +
    ylim(0, 500) +
    ggtitle(names(dat5)[i])
  print(p)
}
Look at Batch 1 start vs Batch 2 start
# Scatter of Batch1T0 against Batch2T0 counts with a red default abline as a
# reference, drawn twice: on log10 axes, and on linear axes with a linear fit.
batch_scatter <- ggplot(dat5, aes(x = Batch1T0, y = Batch2T0)) +
  geom_point(alpha = 0.1) +
  geom_abline(color = "red")
# Log-log view.
batch_scatter + scale_x_log10() + scale_y_log10()
# Linear view with an lm smoother on top.
batch_scatter + geom_smooth(method = "lm")
Look at coverage across libraries
# Reshape dat5 to long form (melt() names the columns `variable` and `value`)
# and draw one box per original column to compare count distributions.
coverage_long <- melt(dat5)
ggplot(coverage_long, aes(x = variable, y = value, color = variable)) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 60, hjust = 1)
  )
## No id variables; using all as measure variables
Look at representation of each query guide in starting pool
# Counts for barcodes flagged present == "yes", annotated with the category
# and query columns from key.
sub <- dat5
sub$category <- key$category
sub$query <- key$query
sub <- sub[which(key$present == "yes"), ]
# One faceted boxplot per library. Libraries are the first no_libs columns
# (previously a hard-coded 1:14), and the annotation columns are selected by
# name rather than by the positions 26 and 27, which were only right while
# dat5 had exactly 25 columns.
for (i in seq_len(no_libs)) {
  temp <- sub[, c(names(dat5)[i], "category", "query")]
  names(temp)[1] <- "count"
  p <- ggplot(temp) +
    geom_boxplot(aes(x = query, y = count, color = query)) +
    ggtitle(names(dat5)[i]) +
    theme(legend.position = "none",
          axis.text.x = element_text(angle = 60, hjust = 1)) +
    scale_y_log10()
  print(p + facet_wrap(~category))
  # Report the library name, then how many and which present barcodes have a
  # zero count (these are the rows the log10 scale cannot display).
  zero_rows <- which(temp$count == 0)
  print(names(dat5)[i])
  print(length(zero_rows))
  print(rownames(sub)[zero_rows])
}
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).
## [1] "Batch1T0"
## [1] 7
## [1] "PRE7g7-RFC5-NRg-2" "PRE4g9-RFC5-NRg-2" "COG3g1-RFC5-NRg-2"
## [4] "SED5g5-RFC5-NRg-2" "DIP2g5-RFC5-NRg-2" "TIF6g8-RFC5-NRg-2"
## [7] "SAP30g7-RFC5-NRg-2"
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## [1] "YPDR1T1"
## [1] 4
## [1] "PRE7g7-RFC5-NRg-2" "PRE4g9-RFC5-NRg-2" "COG3g1-RFC5-NRg-2"
## [4] "DIP2g5-RFC5-NRg-2"
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## [1] "YPDR1T2"
## [1] 4
## [1] "PRE7g7-RFC5-NRg-2" "SED5g5-RFC5-NRg-2" "TIF6g8-RFC5-NRg-2"
## [4] "SAP30g7-RFC5-NRg-2"
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## [1] "YPDR1T3"
## [1] 3
## [1] "PRE7g7-RFC5-NRg-2" "PRE4g9-RFC5-NRg-2" "COG3g1-RFC5-NRg-2"
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## [1] "YPDR2T1"
## [1] 1
## [1] "DIP2g5-RFC5-NRg-2"
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## [1] "YPDR2T2"
## [1] 4
## [1] "PRE7g7-RFC5-NRg-2" "PRE4g9-RFC5-NRg-2" "COG3g1-RFC5-NRg-2"
## [4] "DIP2g5-RFC5-NRg-2"
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## [1] "YPDR2T3"
## [1] 6
## [1] "PRE4g9-RFC5-NRg-2" "COG3g1-RFC5-NRg-2" "SED5g5-RFC5-NRg-2"
## [4] "GET2g2-CCT8-TRg-2" "DIP2g5-RFC5-NRg-2" "TIF6g8-RFC5-NRg-2"
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).
## [1] "YPDR3T1"
## [1] 5
## [1] "PRE7g7-RFC5-NRg-2" "PRE4g9-RFC5-NRg-2" "DIP2g5-RFC5-NRg-2"
## [4] "TIF6g8-RFC5-NRg-2" "SAP30g7-RFC5-NRg-2"
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).
## [1] "YPDR3T2"
## [1] 7
## [1] "PRE7g7-RFC5-NRg-2" "PRE4g9-RFC5-NRg-2" "SED5g5-RFC5-NRg-2"
## [4] "GET2g2-CCT8-TRg-2" "DIP2g5-RFC5-NRg-2" "TIF6g8-RFC5-NRg-2"
## [7] "SAP30g7-RFC5-NRg-2"
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## [1] "YPDR3T3"
## [1] 3
## [1] "PRE7g7-RFC5-NRg-2" "GET2g2-CCT8-TRg-2" "DIP2g5-RFC5-NRg-2"
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## [1] "YPDR4T1"
## [1] 1
## [1] "PRE4g9-RFC5-NRg-2"
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## [1] "YPDR4T2"
## [1] 4
## [1] "PRE7g7-RFC5-NRg-2" "COG3g1-RFC5-NRg-2" "SED5g5-RFC5-NRg-2"
## [4] "TIF6g8-RFC5-NRg-2"
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).
## [1] "YPDR4T3"
## [1] 5
## [1] "PRE7g7-RFC5-NRg-2" "DIP2g5-CDC42-TRg-6" "DIP2g5-RFC5-NRg-2"
## [4] "TIF6g8-RFC5-NRg-2" "YLR050Cg1-RFC5-NRg-2"
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).
## [1] "Batch2T0"
## [1] 7
## [1] "PRE7g7-RFC5-NRg-2" "PRE4g9-RFC5-NRg-2" "COG3g1-RFC5-NRg-2"
## [4] "DIP2g5-RFC5-NRg-2" "TIF6g8-RFC5-NRg-2" "YLR050Cg1-RFC5-NRg-2"
## [7] "SAP30g7-RFC5-NRg-2"
Calculate and plot chimera subtraction
# Per-row labels for dat5: the BC1 (query) and BC2 (array) entries come from
# key, the double-barcode (DBC) name from dat5's row names.
all_BC1 <- key$query
all_BC2 <- key$array
all_DBC <- rownames(dat5)
# BC1_freqs: one row per distinct BC1, one column per library (the first
# no_libs columns of dat5), plus a trailing BC1_name column holding the label.
BC1_freqs <- setNames(
  data.frame(matrix(nrow = length(unique(all_BC1)), ncol = no_libs)),
  names(dat5)[1:no_libs]
)
BC1_freqs$BC1_name <- unique(all_BC1)
# Fill each cell with the summed dat5 counts of all rows carrying that BC1.
# The last column is the label, hence ncol - 1.
for (i in 1:nrow(BC1_freqs)) {
  for (j in 1:(ncol(BC1_freqs) - 1)) {
    rows_with_bc1 <- which(all_BC1 == BC1_freqs$BC1_name[i])
    BC1_freqs[i, j] <- sum(dat5[rows_with_bc1, j])
  }
}
# BC2_freqs: one row per distinct BC2, one column per library (the first
# no_libs columns of dat5), plus a trailing BC2_names column holding the label.
BC2_freqs <- setNames(
  data.frame(matrix(nrow = length(unique(all_BC2)), ncol = no_libs)),
  names(dat5)[1:no_libs]
)
BC2_freqs$BC2_names <- unique(all_BC2)
# Fill each cell with the summed dat5 counts of all rows carrying that BC2.
# The last column is the label, hence ncol - 1.
for (i in 1:nrow(BC2_freqs)) {
  for (j in 1:(ncol(BC2_freqs) - 1)) {
    rows_with_bc2 <- which(all_BC2 == BC2_freqs$BC2_names[i])
    BC2_freqs[i, j] <- sum(dat5[rows_with_bc2, j])
  }
}
# Expected value for every row of dat5 in one library, computed as the product
# of the row's BC1 total and BC2 total in that library.
#
# Reads the globals dat5, all_BC1, all_BC2, BC1_freqs and BC2_freqs.
#
# Args:
#   sample: column index of the library (same index in dat5, BC1_freqs and
#           BC2_freqs).
# Returns:
#   Numeric vector with one element per row of dat5: BC1 total * BC2 total.
#   A row whose label is missing from the totals table yields NA.
# Side effects:
#   Prints the library name.
get_expected <- function(sample) {
  print(names(dat5)[sample])
  rows <- seq_len(nrow(dat5))
  # Look up each row's label in the totals tables in one vectorised step.
  # The BC2 label column is BC2_names; the previous `$BC2_name` only worked
  # through `$` partial matching.
  bc1_row <- match(all_BC1[rows], BC1_freqs$BC1_name)
  bc2_row <- match(all_BC2[rows], BC2_freqs$BC2_names)
  bc1_count <- as.numeric(BC1_freqs[bc1_row, sample])
  bc2_count <- as.numeric(BC2_freqs[bc2_row, sample])
  bc1_count * bc2_count
}
# Expected-value table with the same rows as dat5 and one column per library:
# column k holds get_expected(k) for the k-th of the first no_libs libraries.
expected_cols <- lapply(1:no_libs, get_expected)
names(expected_cols) <- names(dat5)[1:no_libs]
dat5_expected <- data.frame(
  expected_cols,
  row.names = rownames(dat5),
  check.names = FALSE
)
## [1] "Batch1T0"
## [1] "YPDR1T1"
## [1] "YPDR1T2"
## [1] "YPDR1T3"
## [1] "YPDR2T1"
## [1] "YPDR2T2"
## [1] "YPDR2T3"
## [1] "YPDR3T1"
## [1] "YPDR3T2"
## [1] "YPDR3T3"
## [1] "YPDR4T1"
## [1] "YPDR4T2"
## [1] "YPDR4T3"
## [1] "Batch2T0"
Plot data before and after subtracting chimeric reads for each time point in each pool and save new count tables
# Subtract the chimera background from the observed counts of one library.
#
# A zero-intercept line Observed ~ Expected is fitted to the rows whose
# presence is "no". For every row whose presence is "yes", that slope times the
# row's Expected value is subtracted from its Observed count.
#
# Args:
#   temp:   data frame with columns Observed, Expected, presence and category.
#   column: index into names(dat5); used only for the plot title.
# Returns:
#   The rows of temp with presence == "yes", plus a column `norm` holding the
#   corrected counts.
# Side effects:
#   Prints the fitted slope, the summed counts before and after correction,
#   and the "Before" scatter plot.
subtract_BG <- function(temp, column) {
  # Fit line to non-existent DBCs (subset is now passed by name).
  fit_non <- coef(lm(Observed ~ Expected + 0, data = temp,
                     subset = presence == "no"))
  print("slope/total counts (before chimera removal)/total counts (after chimera removal)")
  print(unlist(fit_non[1]))
  # Keep only the strains that exist in the pool.
  sub_temp <- temp[which(temp$presence == "yes"), ]
  # Plot before correction; the line is the fitted background.
  p <- ggplot(temp) +
    geom_point(aes(x = Expected, y = Observed, shape = presence, color = category),
               alpha = 0.2) +
    theme(legend.position = "none") +
    ylim(0, 6000) +
    geom_abline(slope = fit_non[1]) # optionally: + xlim(0, 4e10)
  print(p + ggtitle(paste(names(dat5)[column], " Before")))
  # Correct every existing strain in one vectorised step. Unlike the former
  # 1:dim(sub_temp)[1] loop, this also works when no row has presence "yes".
  sub_temp$norm <- sub_temp$Observed - as.numeric(fit_non[1]) * sub_temp$Expected
  # Print total counts before and after subtraction.
  print(sum(sub_temp$Observed))
  print(sum(sub_temp$norm))
  # Corrected data for the downstream fitness calculation.
  sub_temp
}
# Count tables before (dat5_unnorm) and after (dat5_norm) chimera removal,
# restricted to strains that exist in the pool.
# Rows are selected positively with present == "yes". The former
# dat5[-which(key$present == "no"), ] returned zero rows whenever no row was
# flagged "no", and could keep rows that subtract_BG() (which keeps
# presence == "yes") does not return.
present_rows <- which(key$present == "yes")
dat5_unnorm <- dat5[present_rows, seq_len(no_libs), drop = FALSE]
dat5_norm <- data.frame(matrix(nrow = nrow(dat5_unnorm), ncol = no_libs))
names(dat5_norm) <- names(dat5_unnorm)
rownames(dat5_norm) <- rownames(dat5_unnorm)
# For each library: subtract chimeras, plot raw vs. corrected counts, and
# store the corrected counts.
for (column in seq_len(no_libs)) {
  print(names(dat5)[column])
  temp <- data.frame(
    Observed = dat5[, column],
    Expected = dat5_expected[, column],
    presence = key$present,
    category = key$category,
    row.names = rownames(dat5)
  )
  temp2 <- subtract_BG(temp, column)
  # The corrected rows must line up with dat5_norm before being stored.
  stopifnot(identical(rownames(temp2), rownames(dat5_norm)))
  # Raw counts coloured by presence, with corrected counts overlaid as open
  # circles.
  p <- ggplot() +
    geom_point(data = temp,
               aes(x = Expected, y = Observed, color = presence), alpha = 0.2) +
    geom_point(data = temp2,
               aes(x = Expected, y = norm), shape = 1, alpha = 0.1) +
    theme(legend.position = "none") +
    ylim(0, 4000) +
    xlim(0, max(temp2$Expected)) +
    xlab("Expected") +
    ylab("Observed")
  print(p + ggtitle(paste(names(dat5)[column], "After")))
  dat5_norm[, column] <- temp2$norm
}
## [1] "Batch1T0"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 1.470344e-09
## [1] 6764283
## [1] 6697975
## Warning: Removed 7 rows containing missing values (geom_point).
## [1] "YPDR1T1"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 5.546826e-09
## [1] 9239402
## [1] 8772261
## Warning: Removed 4 rows containing missing values (geom_point).
## [1] "YPDR1T2"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 1.339322e-08
## [1] 6706965
## [1] 6114818
## Warning: Removed 5 rows containing missing values (geom_point).
## [1] "YPDR1T3"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 9.850094e-09
## [1] 6376256
## [1] 5990272
## Warning: Removed 3 rows containing missing values (geom_point).
## [1] "YPDR2T1"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 8.874103e-09
## [1] 7027968
## [1] 6595381
## Warning: Removed 2 rows containing missing values (geom_point).
## [1] "YPDR2T2"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 7.592738e-09
## [1] 8332955
## [1] 7815110
## Warning: Removed 4 rows containing missing values (geom_point).
## [1] "YPDR2T3"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 3.467408e-09
## [1] 8552540
## [1] 8308616
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 32 rows containing missing values (geom_point).
## [1] "YPDR3T1"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 1.249243e-08
## [1] 6912390
## [1] 6322898
## Warning: Removed 5 rows containing missing values (geom_point).
## [1] "YPDR3T2"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 5.038186e-09
## [1] 6529006
## [1] 6318387
## Warning: Removed 7 rows containing missing values (geom_point).
## [1] "YPDR3T3"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 7.204287e-09
## [1] 7708560
## [1] 7295702
## Warning: Removed 17 rows containing missing values (geom_point).
## Warning: Removed 14 rows containing missing values (geom_point).
## [1] "YPDR4T1"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 2.623798e-08
## [1] 6893068
## [1] 5656734
## Warning: Removed 2 rows containing missing values (geom_point).
## [1] "YPDR4T2"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 1.714573e-08
## [1] 6436305
## [1] 5735235
## Warning: Removed 4 rows containing missing values (geom_point).
## [1] "YPDR4T3"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 1.67254e-08
## [1] 7056021
## [1] 6243978
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## [1] "Batch2T0"
## [1] "slope/total counts (before chimera removal)/total counts (after chimera removal)"
## Expected
## 1.217239e-09
## [1] 7147676
## [1] 7086267
## Warning: Removed 7 rows containing missing values (geom_point).
Calculate fitness!!!