Load data and packages

require(ggplot2)
## Loading required package: ggplot2
require(reshape2)
## Loading required package: reshape2
require(RColorBrewer)
## Loading required package: RColorBrewer
require(igraph)
## Loading required package: igraph
require(scales)
## Loading required package: scales
theme_dbc <- theme_set(theme_gray())
theme_dbc <- theme_update(
  panel.background = element_rect(fill = "white"),
  panel.border = element_rect( colour = "black",fill=NA,size=2),
  panel.grid.major = element_line(colour = "gray93",size=1),
  panel.grid.minor = element_line(colour = "gray98",size=1),
  strip.text.x = element_text(size=12,face='bold'),
  axis.title = element_text(size=16),
  strip.background = element_rect(colour="black", fill="white",size = 1),
  axis.text = element_text(colour = "black",face="bold",size=16),
  axis.ticks=element_line(color="black",size=2))

#Load in corrected GI scores
ypd24_doubles=read.table(file="~/Desktop/SherlockLab2/manuscript_aug2017/code/final_fitness_GI_estimates/ypd24_short.txt",sep="\t",header=TRUE,stringsAsFactors = FALSE)
ypd48_doubles=read.table(file="~/Desktop/SherlockLab2/manuscript_aug2017/code/final_fitness_GI_estimates/ypd48_short.txt",sep="\t",header=TRUE,stringsAsFactors = FALSE)
ypd37_doubles=read.table(file="~/Desktop/SherlockLab2/manuscript_aug2017/code/final_fitness_GI_estimates/ypd37_short.txt",sep="\t",header=TRUE,stringsAsFactors = FALSE)
ypeg_doubles=read.table(file="~/Desktop/SherlockLab2/manuscript_aug2017/code/final_fitness_GI_estimates/ypeg_short.txt",sep="\t",header=TRUE,stringsAsFactors = FALSE)
ura_doubles=read.table(file="~/Desktop/SherlockLab2/manuscript_aug2017/code/final_fitness_GI_estimates/ura_short.txt",sep="\t",header=TRUE,stringsAsFactors = FALSE)

#load in array go annotations
array_go = read.table("~/Desktop/SherlockLab2/manuscript_aug2017/code/figures/GO_lists.txt",sep="\t",stringsAsFactors = FALSE,header=TRUE)

#get sd of gi_scores
temp=data.frame(ypd24_doubles$gi_score,
                ypd48_doubles$gi_score,
                ypeg_doubles$gi_score,
                ypd37_doubles$gi_score,
                ura_doubles$gi_score)
                
names(temp)=c("YPD24hr","YPD48hr","YPEG","YPD37","SCURA")

#calculate sd of interaction scores across all conditions
gi_sd = sd(melt(temp)$value,na.rm=TRUE)
## No id variables; using all as measure variables
gi_mean = mean(melt(temp)$value,na.rm=TRUE)
## No id variables; using all as measure variables
rm(temp)

#set thresholds for z score
zval = 2
zthresh_upper = (zval*gi_sd)+gi_mean
zthresh_lower = (-zval*gi_sd)+gi_mean
gi_sd
## [1] 0.0781284
gi_mean
## [1] 0.003516402
zthresh_upper
## [1] 0.1597732
zthresh_lower
## [1] -0.1527404
#record bioprocesses for query guides
proteosome = c("PRE7g7","PRE4g9","PRE4g3","RPN5g1")
secretory = c("COG3g1","SED5g5","SEC22g1","SEC22g2","COG8g2","GET2g2")
ribo = c("IMP4g6","DIP2g5","PWP2g2_BC1","PWP2g2_BC2","TIF6g8","RPF1g3","MAK16g1")

Make scaffold network use z score >2 in any condition use max gi score for edge weights

#zthresh=2

#put GI score data from each conditions into a single data frame
all_data = ypd24_doubles[,c(2,3,9)]
names(all_data)[3]="YPD24hr"
all_data$YPD48hr = ypd48_doubles[,9]
all_data$YPEG = ypeg_doubles[,9]
all_data$YPD37C = ypd37_doubles[,9]
all_data$SCURA = ura_doubles[,9]
all_data$num_pos = apply(all_data[,3:7],1,function(x) length(which(x > (zthresh_upper))))
all_data$num_neg = apply(all_data[,3:7],1,function(x) length(which(x < (zthresh_lower))))

#check how many were in each category
xtabs(~num_pos+num_neg,data=all_data)
##        num_neg
## num_pos     0     1     2     3     4     5
##       0 12892  1010   163    45    10     2
##       1   692    18     0     0     0     0
##       2   191     5     0     0     0     0
##       3   125     2     0     0     0     0
##       4    39     0     0     0     0     0
##       5     6     0     0     0     0     0
#save gene pairs where sign change occurs
dyn_sub=subset(all_data,num_pos!=0&num_neg!=0)
dim(dyn_sub)[1]#number of gene pairs with switch in sign
## [1] 25
#remove rows where change in sign occurs
sub_data = all_data[-which(all_data$num_pos>0&all_data$num_neg>0),]

#remove rows where no interaction was detected
sub_data = sub_data[-which(sub_data$num_pos==0&sub_data$num_neg==0),]

#record maximum interaction score for network drawing
sub_data$weight = apply(sub_data[,3:7],1,function(x)max(abs(x),na.rm=TRUE))


#record whether the interaction is condition specific
cond_data = subset(sub_data,num_pos==1|num_neg==1)#first subset to rows with just 1 GI
#remove rows with missing measurements and use stringent def of cond. spec (all other z scores < 1)
cond_data$num_na=apply(cond_data[,3:7],1,function(x)length(which(is.na(x))))
cond_data = cond_data[-which(cond_data$num_na!=0),1:9]
cond_data$stringent = apply(cond_data[,3:7],1,function(x)length(which(x > (gi_sd+gi_mean) | x < (-gi_sd+gi_mean))))
#cond_data$stringent_pos = apply(cond_data[,3:7],1,function(x)length(which(x > (gi_sd+gi_mean))))
#cond_data$stringent_neg = apply(cond_data[,3:7],1,function(x)length(which(x < (-gi_sd+gi_mean))))
cond_data = subset(cond_data,stringent==1)[,1:9]
#cond_data = subset(cond_data,(num_pos==1&stringent_pos==1)|(num_neg==1&stringent_neg==1))[,1:9]
sub_data$stringent = rownames(sub_data)%in%rownames(cond_data)


#format data for igraph plotting
#first data frame stores edge information
temp = sub_data[,c(1:2,8:11)]
table(temp$stringent)
## 
## FALSE  TRUE 
##  1829   454
names(temp)[1:2]=c("from","to")
temp$sign.label = "pos"
temp$sign.type = 1
temp$sign.label[which(temp$num_neg>0)]="neg"
temp$sign.type[which(temp$num_neg>0)]=2

table(temp$sign.label)
## 
##  neg  pos 
## 1230 1053
#second data frame stores node information
temp2 = data.frame(c(unique(temp$from),unique(temp$to)),
                   c(rep("query",length(unique(temp$from))),rep("array",length(unique(temp$to)))),
                   c(rep(1,length(unique(temp$from))),rep(2,length(unique(temp$to)))),
                   c(rep(5,length(unique(temp$from))),rep(2.5,length(unique(temp$to)))))
names(temp2)=c("id","guide.label","guide.type","guide.size")
temp2$names = gsub("-.+","",temp2$id)
temp2$names[which(temp2$guide.label=="array")]=NA
temp2$query_process = 6
temp2$query_process[which(temp2$id%in%proteosome)]=1
temp2$query_process[which(temp2$id%in%secretory)]=2
temp2$query_process[which(temp2$id%in%ribo)]=3
temp2$query_process[which(temp2$id=="SAP30g7")]=4
temp2$query_process[which(temp2$id%in%c("YCR016Wg4","YLR050Cg1"))]=5

#build igraph network object
net = graph.data.frame(d=temp,vertices=temp2,directed=F)

#fill in cosmetics of plot
colrs3 = c("tomato","cornflowerblue","darkseagreen","darkorchid3","gray35","white")
V(net)$color <- colrs3[V(net)$query_process]
E(net)$width=E(net)$weight*1
colrs2 <- c("goldenrod","blue","red","grey40")
E(net)$color = colrs2[E(net)$sign.type]

#calculate and save layout of network
set.seed(9099)
l <- layout.kamada.kawai(net,weights=E(net)$weight)

plot(net,vertex.label=NA, main="any",layout=l,vertex.size=V(net)$guide.size)

#plot subnetworks for YPD48h and YPEG conditions
#red edges are conditions specific
#all other detected GIS in the condition are grea
for(i in 4:5){
  #make data frame for containing edge information for igraph
  temp = sub_data[which(sub_data[,i] > zthresh_upper | sub_data[,i] < zthresh_lower),c(1,2,8,9,i,11)]
  table(temp$stringent)
  names(temp)[c(1:2,5)]=c("from","to","weight")
  temp$sign.type=4
  temp$sign.type[which(temp$stringent==TRUE)]=3
  temp$weight = abs(temp$weight)
  temp$weight[which(temp$stringent==TRUE)]=temp$weight[which(temp$stringent==TRUE)]*3
  
  #build network object and plot
  net = graph.data.frame(d=temp,vertices=temp2,directed=F)
  V(net)$color <- colrs3[V(net)$query_process]
  E(net)$width=E(net)$weight
  E(net)$color = colrs2[E(net)$sign.type]
  plot(net,main=names(sub_data)[i],layout=l,vertex.size=V(net)$guide.size,vertex.label=NA)
  
}

Key for network diagrams

ggplot()+geom_point(aes(x=1,y=1),color="tomato",size=12)+
  geom_point(aes(x=1.5,y=1),color="cornflowerblue",size=12)+
  geom_point(aes(x=2,y=1),color="darkseagreen",size=12)+
  geom_point(aes(x=2.5,y=1),color="darkorchid3",size=12)+
  geom_point(aes(x=3,y=1),color="gray35",size=12)+
  geom_point(aes(x=3.5,y=1),color="white",size=12)+xlim(0.5,4)+
  geom_point(aes(x=c(1,1.5,2,2.5,3,3.5),y=rep(1,6)),size=12,shape=1)+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

Barplot with line graphs number of guide pairs with significant gis in each condition

#subset all data to only guide pairs with measurements in all 5 conditions
temp2 = all_data
temp2$missing = apply(temp2[,3:7],1,function(x)length(which(is.na(x))))
temp2 = temp2[-which(temp2$missing>0),1:9]
temp2$stringent = rownames(temp2)%in%rownames(cond_data)
print(dim(temp2))
## [1] 12715    10
#record number of GIs in each condition, also note condition specific and cumulative total
int_cond = data.frame(matrix(nrow=5,ncol=8))
names(int_cond)=c("cond","neg_gi","pos_gi","pos_sap","pos_notsap","cond_spec","cumulative","both")
int_cond$cond = 1:5

count = 0
for(i in 3:7){
  count = count + 1
  int_cond$pos_gi[count]=length(which(temp2[,i] > zthresh_upper))#/dim(temp2)[1]*100
  int_cond$neg_gi[count]=length(which(temp2[,i] < zthresh_lower))#/dim(temp2)[1]*100
  sub = temp2[which(temp2[,i] > zthresh_upper),]
  int_cond$pos_sap[count] = length(which(sub$query=="SAP30g7"))#/dim(temp2)[1]*100
  int_cond$pos_notsap[count] = (dim(sub)[1]-length(which(sub$query=="SAP30g7")))#/dim(temp2)[1]*100
  
  sub2 = temp2[which(temp2[,i] > zthresh_upper | temp2[,i] < zthresh_lower),]
  if(count ==1){runtot = rownames(sub2)}
  if(count > 1){runtot = unique(c(runtot,rownames(sub2)))}
  int_cond$cumulative[count]=length(runtot)#/dim(temp2)[1]*100
  
  int_cond$cond_spec[count]=length(which(sub2$stringent==TRUE))#/dim(temp2)[1]*100
  int_cond$both[count]=dim(sub2)[1]
  
}

#make bar graph
ggplot(melt(int_cond[,-3],id.vars=c("cond","cumulative","cond_spec","both")))+
  geom_bar(aes(x=cond,y=value,fill=as.character(cond),alpha=variable),
           stat="identity",position="stack",color="black")+
  scale_fill_manual(values=c("#A3A500","#00B0F6","#E76BF3","#00BF7D","#F8766D"))+
  scale_alpha_manual(values=c(0.9,0.6,0.6))+
  geom_line(size=2,alpha=0.9,aes(x=cond,y=cumulative),color="grey19")+
  geom_point(size=8,alpha=0.9,aes(x=cond,y=cumulative),color="grey19")+
  geom_line(size=2,alpha=0.9,aes(x=cond,y=cond_spec),color="grey19")+
  geom_point(size=8,alpha=0.9,aes(x=cond,y=cond_spec),color="grey19",shape=17)+
  xlab("")+ylab("")+theme(axis.text.x=element_blank())

Make heatmaps for GI scores aggregated across strains carrying guides targeting the same gene pair

#record guide names for guide sequences contained in both query plasmids and starting pool
rep_array_names=c("PRE7-TRg-7","PRE4-NRg-9","PRE4-TRg-3","RPN5-NRg-1","COG3-TRg-1","SED5-TRg-5","SEC22-NRg-8","IMP4-TRg-6","DIP2-TRg-5","PWP2-NRg-2","TIF6-NRg-8","RPF1-TRg-3","MAK16-TRg-1")
rep_query_names=c("PRE7g7","PRE4g9","PRE4g3","RPN5g1","COG3g1","SED5g5","SEC22g2","IMP4g6","DIP2g5","PWP2g2_BC1","TIF6g8","RPF1g3","MAK16g1","PWP2g2_BC2")

#combine all GI scores into one table
temp=data.frame(ypd24_doubles$gi_score,
                ypd48_doubles$gi_score,
                ypeg_doubles$gi_score,
                ypd37_doubles$gi_score,
                ura_doubles$gi_score)
names(temp)=c("YPD24hr","YPD48hr","YPEG","YPD37","SCURA")
temp$strain = ypd24_doubles$strain
temp$query = ypd24_doubles$query
temp$array = ypd24_doubles$array
temp$query_target = gsub("g.+","",temp$query)
temp$array_target = gsub("-.+","",temp$array)
temp$gene_pair = paste(temp$query_target,temp$array_target,sep="_")

#remove rows where the gene targets are the same
temp = temp[-which(temp$query_target==temp$array_target),]

#re-label gene_pair for reverse orientation replicates
temp$gene_pair2 = temp$gene_pair
rep_gene_targets = unique(gsub("-.+","",rep_array_names))
for(i in 1:(length(rep_gene_targets)-1)){
  for(j in (i+1):length(rep_gene_targets)){
    old_label = paste(rep_gene_targets[j],rep_gene_targets[i],sep="_")#get gene pair labels to replace
    new_label = paste(rep_gene_targets[i],rep_gene_targets[j],sep="_")#get label to replace old label
    temp$gene_pair2[which(temp$gene_pair==old_label)]=new_label
  }
}
print(length(unique(temp$gene_pair)))
## [1] 7791
print(length(unique(temp$gene_pair2)))
## [1] 7725
#make new data frame to record mean and sd of GI scores across replicate strains
#require at least 3 replicate strains for this analysis
rep_dat = subset(data.frame(table(temp$gene_pair2)),Freq>2)
rep_dat = cbind(rep_dat,data.frame(matrix(nrow=dim(rep_dat)[1],ncol=15)))

names(rep_dat)=c("gene_pair","nstrains",
                 "nest.ypd24","nest.ypd48","nest.ypeg","nest.ypd37","nest.ura",
                 "mean.ypd24","mean.ypd48","mean.ypeg","mean.ypd37","mean.ura",
                 "sd.ypd24","sd.ypd48","sd.ypeg","sd.ypd37","sd.ura")

#loop through each gene pair and record gi score data from across strains and conditions
for(i in 1:dim(rep_dat)[1]){
  sub = subset(temp,gene_pair2 == rep_dat$gene_pair[i])[,1:5]
  rep_dat[i,3:7]=apply(sub,2,function(x)length(which(!is.na(x))))
  rep_dat[i,8:12]=apply(sub,2,function(x)mean(x,na.rm=TRUE))
  rep_dat[i,13:17]=apply(sub,2,function(x)sd(x,na.rm=TRUE))
  
  }
rownames(rep_dat)=rep_dat$gene_pair

#re-format for heatmap plotting
rep_dat_l = rep_dat[,c(1:3,8,13)]
names(rep_dat_l)[3:5]=c("num_est","mean","sd")
rep_dat_l$condition = "ypd24"

temp4 = rep_dat[,c(1:2,4,9,14)]
names(temp4)[3:5]=c("num_est","mean","sd")
temp4$condition = "ypd48"
rep_dat_l = rbind(rep_dat_l,temp4)

temp4 = rep_dat[,c(1:2,5,10,15)]
names(temp4)[3:5]=c("num_est","mean","sd")
temp4$condition = "ypeg"
rep_dat_l = rbind(rep_dat_l,temp4)

temp4 = rep_dat[,c(1:2,6,11,16)]
names(temp4)[3:5]=c("num_est","mean","sd")
temp4$condition = "ypd37"
rep_dat_l = rbind(rep_dat_l,temp4)

temp4 = rep_dat[,c(1:2,7,12,17)]
names(temp4)[3:5]=c("num_est","mean","sd")
temp4$condition = "ura"
rep_dat_l = rbind(rep_dat_l,temp4)

#set factor levels for conditions
rep_dat_l$condition = factor(rep_dat_l$condition,levels=c("ypd24","ypd48","ypeg","ypd37","ura"))

#set mean and sd to NA if there are fewer than 3 replicate measurments
rep_dat_l$mean[which(rep_dat_l$num_est<3)]=NA
rep_dat_l$sd[which(rep_dat_l$num_est<3)]=NA

#record whether estimates pass threshold of 95% CI across replicate strains 
#non overlapping with zscore of 1
for(i in 1:dim(rep_dat_l)[1]){
  rep_dat_l$lower.95[i]=rep_dat_l$mean[i]-(1.96*(rep_dat_l$sd[i]/sqrt(rep_dat_l$num_est[i])))
  rep_dat_l$upper.95[i]=rep_dat_l$mean[i]+(1.96*(rep_dat_l$sd[i]/sqrt(rep_dat_l$num_est[i])))
}
rep_dat_l$sig = (rep_dat_l$upper < (-gi_sd+gi_mean) | rep_dat_l$lower > (gi_sd+gi_mean))
rep_dat_l$sig2 = ""
rep_dat_l$sig2[which(rep_dat_l$sig==TRUE)]="*"

#generate list of unique gene pairs with a significant interaction in at least one condition
sig_pairs = unique(subset(rep_dat_l,sig==TRUE)$gene_pair)
print(length(sig_pairs))
## [1] 198
print(dim(rep_dat))#total gene pairs tested
## [1] 1711   17
#Make first heatmap for supplement including all gene pairs passing threshold
#first get order of guide pairs (clustered based on euclidean distant), also plot dendrogram
coldist=dist(subset(rep_dat,gene_pair%in%sig_pairs)[,8:12])
colclust = hclust(coldist,method="average")
colclust.d = as.dendrogram(colclust)
colorder=colclust$labels[colclust$order]
par(mar=c(10,4,4,2))
plot(colclust.d,edgePar=list(lwd=3))

par(mar=c(5,4,4,2))
#make heatmap
rep_dat_plot = subset(rep_dat_l,gene_pair%in%sig_pairs)
rep_dat_plot$gene_pair = factor(rep_dat_plot$gene_pair, levels = colorder)
print(qplot(x=condition, y=gene_pair, data=rep_dat_plot,
            fill=mean,geom="tile")+
        scale_fill_gradientn(colours=c("steelblue","black","goldenrod"),
                             values=rescale(c(-0.33,zthresh_lower,0,zthresh_upper,0.82)),
                             guide="colorbar",limits=c(-0.33,0.82))+
        geom_text(aes(label=sig2),color="white")+
        theme(axis.text.x=element_text(angle=120,hjust=1),axis.text.y=element_text(angle=180,hjust=0))+xlab("")+ylab(""))

#Make second heatmap for figure 2 excluding gene pairs with positive interaction including SAP30
#and gene pairs that did not have estimate in all 5 conditions
rem_pairs1 = rep_dat_l$gene_pair[which(is.na(rep_dat_l$mean))] #names of those with missing values
rem_pairs2 = temp$gene_pair2[which(temp$query_target=="SAP30"&temp$YPEG>(gi_sd+gi_mean))] #names of those with pos SAP30 (z score >1)
sig_pairs2 = sig_pairs[-which(sig_pairs%in%rem_pairs1|sig_pairs%in%rem_pairs2)]
print(length(sig_pairs2))
## [1] 131
coldist=dist(subset(rep_dat,gene_pair%in%sig_pairs2)[,8:12])
colclust = hclust(coldist,method="average")
colclust.d = as.dendrogram(colclust)
colorder=colclust$labels[colclust$order]
par(mar=c(10,4,4,2))
plot(colclust.d,edgePar=list(lwd=3))

par(mar=c(5,4,4,2))
#make heatmap
rep_dat_plot = subset(rep_dat_l,gene_pair%in%sig_pairs2)
rep_dat_plot$gene_pair = factor(rep_dat_plot$gene_pair, levels = colorder)
print(qplot(x=condition, y=gene_pair, data=rep_dat_plot,
            fill=mean,geom="tile")+
        scale_fill_gradientn(colours=c("steelblue","black","goldenrod"),
                             values=rescale(c(-0.33,zthresh_lower,0,zthresh_upper,0.82)),
                             guide="colorbar",limits=c(-0.33,0.82))+
        geom_text(aes(label=sig2),color="white")+
        theme(axis.text.x=element_text(angle=120,hjust=1),axis.text.y=element_text(angle=180,hjust=0))+xlab("")+ylab(""))

Print summary stats cited in manuscript for this section

#print summary stats for manuscript text record number of significant interactions
int_cond #print table of data by condition
##   cond neg_gi pos_gi pos_sap pos_notsap cond_spec cumulative both
## 1    1     51    216     185         31         1        267  267
## 2    2    260    402     269        133        40        700  662
## 3    3    626    677     311        366       365       1634 1303
## 4    4     80     68       9         59        11       1719  148
## 5    5     97    161      46        115        37       1855  258
int_cond$cumulative[5]/dim(temp2)[1]*100 #print percent of unique guide pairs with interaction in any condition
## [1] 14.58907
int_cond$cumulative[2]/dim(temp2)[1]*100 #print percent of unique guide pairs with interaction in either ypd condition
## [1] 5.505309
int_cond$both/dim(temp2)[1]*100 #print percent of guide pairs interacting in each condition
## [1]  2.099882  5.206449 10.247739  1.163980  2.029099
sum(int_cond$cond_spec)/dim(temp2)[1]*100 #print percent of condition specific pairs
## [1] 3.570586
(int_cond$cumulative[5]/dim(temp2)[1]*100)/(int_cond$cumulative[2]/dim(temp2)[1]*100) #print fold increasing using all 5 conditions versus just rich media
## [1] 2.65
#record gene pairs from temp to temp2
temp2$gene_pair = temp$gene_pair[match(rownames(temp2),temp$strain)]
#record percent of unique gene pairs showing a significant interactions in at least one condition
length(unique(temp2$gene_pair[-which(is.na(temp2$gene_pair))]))
## [1] 6841
length(unique(temp2$gene_pair[which(temp2$num_pos>0|temp2$num_neg>0)]))/length(unique(temp2$gene_pair[-which(is.na(temp2$gene_pair))]))*100
## [1] 20.65488
#remove SAP30 containing pairs and calculate again
temp3 = temp2[-which(temp2$query=="SAP30g7"),]
length(unique(temp3$gene_pair[-which(is.na(temp3$gene_pair))]))
## [1] 6391
length(unique(temp3$gene_pair[which(temp3$num_pos>0|temp3$num_neg>0)]))/length(unique(temp3$gene_pair[-which(is.na(temp3$gene_pair))]))*100
## [1] 16.93006
#Number of GIs detected in YPD24hr also detected in YPD48hr (and fold increase in magnitude of GI)
#ypd24_gis = which(abs(temp2$YPD24hr)>(2*gi_sd+gi_mean))
ypd24_gis = which(temp2$YPD24hr > zthresh_upper | temp2$YPD24hr < zthresh_lower)
#ypd48_gis = which(abs(temp2$YPD48hr)>(2*gi_sd+gi_mean))
ypd48_gis = which(temp2$YPD48hr > zthresh_upper | temp2$YPD48hr < zthresh_lower)
length(ypd24_gis)
## [1] 267
length(which(ypd24_gis%in%ypd48_gis))
## [1] 229
length(which(ypd24_gis%in%ypd48_gis))/length(ypd24_gis)*100
## [1] 85.76779
sub = temp2[ypd24_gis[which(ypd24_gis%in%ypd48_gis)],]
mean(abs(sub$YPD48hr)/abs(sub$YPD24hr))
## [1] 1.39651