Summary

This script is the 3rd of 4 scripts meant to process data in R. It takes Ensembl Variant Effect Predictor (VEP) outputs and analyses them.

This script is broken up into 4 main sections

Sections:

  1. Analysis of SNP effects
  2. Plotting
  3. P value calculation
  4. Analysis of BLOSUM62 scores

Required files: this script reads 5 Ensembl VEP output files:

  1. VEP predictions for all expressed genes in the fat body: expressed.txt
  2. VEP predictions for differentially expressed genes in the fat body in response to infection: detreated.txt
  3. VEP predictions for differentially expressed immune genes in the fat body in response to infection: deimmune.txt
  4. VEP predictions for 22 potential trans source genes: transcources.txt
  5. VEP predictions for detected immune genes that are not differentially expressed (compared against the DE immune genes in Sections 3 and 4): exnonDEimm.txt

Section 1: Analysis of SNP effects.

##load Libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rlang)
## Warning: package 'rlang' was built under R version 3.6.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(reshape2)
library(KSgeneral)

# NOTE: the workspace is deliberately NOT cleared here. `rm(list = ls())`
# silently deletes everything the caller has in the global environment, and
# the objects used by the code below are all (re)created before they are read.

# Run the script from the directory that holds the VEP output files, e.g.
#setwd("~/Desktop/Wunderlichlab/Cis_var_project/SNPperGene/SNPout/Ensembleout/")

# Import all the tables.
# The order of `names` fixes the position of each table in `myfullfiles`:
#   1 = deimmune, 2 = detreated, 3 = exnonDEimm, 4 = expressed, 5 = transcources
names <- c("deimmune.txt", "detreated.txt", "exnonDEimm.txt", "expressed.txt",
           "transcources.txt")
#names<-as.character(strsplit(list.files(pattern="*.txt"), ".txt"))

# Fail early with a single message that lists every missing input, instead of
# stopping on the first unreadable file.
missing_inputs <- names[!file.exists(names)]
if (length(missing_inputs) > 0) {
  stop("Missing VEP output file(s): ",
       paste(missing_inputs, collapse = ", "),
       call. = FALSE)
}
rm(missing_inputs)

# `myfullfiles` is an unnamed list with one data frame per input file.
myfullfiles <- lapply(names, read.delim)

# Rows that have "-" in the Allele column generally have "-" in all the other
# columns as well, so they are removed.
myfullfiles <- lapply(myfullfiles, function(x) x[!(x$Allele == "-"), ])

# Collapse isoforms: records that agree on every column kept below (location,
# allele, consequence, gene, amino-acid change, ...) are treated as one SNP
# regardless of which transcript they were reported for.
# HOWEVER this filtering DOES keep SNPs that result in different amino acids
# depending on the transcript, because those rows differ in the kept columns.
myfullfiles <- lapply(myfullfiles, function(x) {
  unique(x[, c("Location", "Allele", "Consequence", "IMPACT", "SYMBOL",
               "Gene", "Protein_position", "Amino_acids", "Codons",
               "BLOSUM62")])
})

## Tally how often each type of sequence change occurs in every table.
# Short labels for the tables, in the same order as `myfullfiles`.
names <- c("deimmune", "detreated", "exnonDEimm", "expressed", "transcources")
consequences <- c("synonymous_variant", "stop_retained_variant",
                  "missense_variant", "start_lost", "stop_gained",
                  "splice_region_variant")

# One column of counts per table, one row per consequence term.
# Only rows whose Consequence equals the term exactly are counted.
# NOTE(review): VEP can report several comma-separated terms in one
# Consequence field; such rows match none of the terms here -- confirm that
# this is intended.
count_matrix <- vapply(
  myfullfiles,
  function(tbl) {
    observed <- as.character(tbl$Consequence)
    vapply(consequences, function(term) sum(observed == term), integer(1),
           USE.NAMES = FALSE)
  },
  integer(length(consequences))
)
con_cons <- as.data.frame(count_matrix)
colnames(con_cons) <- names
rownames(con_cons) <- consequences
rm(count_matrix)

# Append two summary rows: changes that keep the protein sequence
# (synonymous + stop retained) and changes that alter it
# (missense + start lost + stop gained). splice_region_variant is in neither.
con_cons <- rbind(
  con_cons,
  colSums(con_cons[c("synonymous_variant", "stop_retained_variant"), ])
)
con_cons <- rbind(
  con_cons,
  colSums(con_cons[c("missense_variant", "start_lost", "stop_gained"), ])
)

# The two appended rows sit directly after the per-consequence rows
# (rows 7 and 8 for the six terms above). The spelling of these row names is
# kept as-is because they are part of the resulting table.
rownames(con_cons)[length(consequences) + 1] <- "total_synonomous"
rownames(con_cons)[length(consequences) + 2] <- "total_nonsynonomous"


# Reshape the two summary rows of con_cons (row 7 = synonymous total,
# row 8 = nonsynonymous total) into one long table with a row per
# condition and SNP type.
make_snp_rows <- function(row_index, snp_label) {
  # Transposing a single row gives one row per condition and one count column.
  rows <- as.data.frame(t(con_cons[row_index, ]))
  colnames(rows)[1] <- "SNPS"
  rows$Condition <- rownames(rows)
  rows$SNPtype <- snp_label
  rows
}

synonymous_rows <- make_snp_rows(7, "Synonymous")
nonsynonymous_rows <- make_snp_rows(8, "Nonsynonymous")

# Fraction of each SNP type among synonymous + nonsynonymous SNPs; both
# tables list the conditions in the same order, so element-wise sums line up.
snp_totals <- synonymous_rows$SNPS + nonsynonymous_rows$SNPS
synonymous_rows$Percent <- synonymous_rows$SNPS / snp_totals
nonsynonymous_rows$Percent <- nonsynonymous_rows$SNPS / snp_totals

datplot <- rbind(synonymous_rows, nonsynonymous_rows)
rm(make_snp_rows, synonymous_rows, nonsynonymous_rows, snp_totals)

# Keep only the nonsynonymous rows and fix the order in which the conditions
# are drawn.
datplot_NS <- datplot[datplot$SNPtype != "Synonymous", ]
datplot_NS$Condition <- factor(
  datplot_NS$Condition,
  levels = c("expressed", "detreated", "deimmune", "transcources",
             "exnonDEimm")
)

Section 2: Plotting

# Bar chart of the nonsynonymous fraction per condition, one colour per
# condition (colours follow the factor level order of Condition).
Figure3b <- ggplot(
  data = datplot_NS,
  mapping = aes(x = Condition, y = Percent, fill = Condition)
)
Figure3b <- Figure3b +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    values = c("#FF9E00", "#FF6457", "#D445E4", "#CB66C2", "#6600FF")
  )
# Remove both sets of grid lines from the panel.
Figure3b <- Figure3b +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
Figure3b

## Section 3: P-value calculation

Test: comparing SNP proportions.

Text: “We found that DE immune genes have a significantly higher fraction of nonsynonymous sequence changes (24%) compared to the fat body detected genes (21%)… DE infected genes and trans source genes both showed lower fractions of nonsynonymous changes (20% for both sets)”

#### Proportion tests: which pairs of gene sets are compared ####

p1 <- c("expressed", "expressed", "expressed", "deimmune")
p2 <- c("detreated", "deimmune", "transcources", "exnonDEimm")
combinations <- data.frame(p1, p2, stringsAsFactors = FALSE)
rm(p1, p2)
combinations
##          p1           p2
## 1 expressed    detreated
## 2 expressed     deimmune
## 3 expressed transcources
## 4  deimmune   exnonDEimm

# Bonferroni correction: one test per row of `combinations` (4 tests).
# p.adjust() multiplies by the number of tests AND caps the result at 1;
# a plain `4 * p` can report an impossible p-value above 1.
n_tests <- nrow(combinations)

# NOTE(review): the counts below are hard-coded. They look like
# nonsynonymous SNP counts (`success`) out of synonymous + nonsynonymous
# SNP counts (`trails`) for each gene set -- confirm against con_cons
# whenever the input files change.

# Expressed vs DE infection
success <- c(19819, 3610)
trails <- c(93963, 17804)
test4ED <- prop.test(success, trails)
p.adjust(test4ED$p.value, method = "bonferroni", n = n_tests)
## [1] 0.05832668

# Expressed vs DE immune
success <- c(19819, 604)
trails <- c(93963, 2561)
test4EI <- prop.test(success, trails)
p.adjust(test4EI$p.value, method = "bonferroni", n = n_tests)
## [1] 0.01003889

# Expressed vs trans sources
success <- c(19819, 46)
trails <- c(93963, 226)
test4ET <- prop.test(success, trails)
p.adjust(test4ET$p.value, method = "bonferroni", n = n_tests)
## [1] 1

# Detected non-DE immune genes vs DE immune genes
success <- c(424, 604)
trails <- c(2268, 2561)
test4NI <- prop.test(success, trails)
p.adjust(test4NI$p.value, method = "bonferroni", n = n_tests)
## [1] 0.0001599738

Section 4: BLOSUM62 scores

########### BLOSUM62 scores #########
# Score categories that are tallied; rows with any other value in the
# BLOSUM62 column (including the "-" placeholder) are not counted.
bloscores <- c(-4, -3, -2, -1, 0, 1, 2, 3)

# Column labels, in the same order as the tables in `myfullfiles`
# (deimmune, detreated, exnonDEimm, expressed, transcources).
count_cols <- c("deimmune", "detreated", "nondeimmune", "expressed",
                "transcources")
percent_cols <- c("deimmunepercent", "detreatedpercent", "ndeimmunepercent",
                  "expressedpercent", "transcourcespercent")

# One column per table, one row per score: number of SNPs with that score.
# The column is looked up by name rather than by position.
score_counts <- vapply(
  myfullfiles,
  function(tbl) {
    observed <- tbl[["BLOSUM62"]]
    vapply(bloscores, function(score) sum(observed == score), integer(1))
  },
  integer(length(bloscores))
)
colnames(score_counts) <- count_cols

# Fraction of each score within a table (each column sums to 1).
score_fractions <- prop.table(score_counts, margin = 2)
colnames(score_fractions) <- percent_cols

# Wide table: counts, the score itself, then the fractions.
blossums <- data.frame(score_counts, bloscores = bloscores, score_fractions)
rm(score_counts, score_fractions)

# Treat the score as an ordered category so it plots as discrete bars.
blossums$bloscores <- factor(blossums$bloscores, levels = bloscores)

bloscounts <- blossums[, c(count_cols, "bloscores")]
blosper <- blossums[, c("bloscores", percent_cols)]
rm(count_cols, percent_cols)

# Long format: one row per (score, gene set).
# NOTE(review): the value column of `bloscounts` holds counts although it is
# labelled "Percent"; the label is kept unchanged in case later code uses it.
bloscounts <- melt(bloscounts, id.vars = "bloscores")
colnames(bloscounts) <- c("BlossumScore", "Genes", "Percent")
blosper <- melt(blosper, id.vars = "bloscores")
colnames(blosper) <- c("BlossumScore", "Genes", "Percent")

## Plot the score fractions for every gene set except the trans sources.
blospernot <- blosper[blosper$Genes != "transcourcespercent", ]
# Fix the order of the gene sets; it must match the order of the colours and
# legend labels given to scale_fill_manual() below.
blospernot$Genes <- factor(
  blospernot$Genes,
  levels = c("expressedpercent", "detreatedpercent", "deimmunepercent",
             "ndeimmunepercent")
)

# Grouped bars: one group per BLOSUM62 score, one bar per gene set.
Supplemental_Figure_S3 <- ggplot(
  data = blospernot,
  mapping = aes(x = BlossumScore, y = Percent, fill = Genes)
)
Supplemental_Figure_S3 <- Supplemental_Figure_S3 +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    values = c("#FF9E00", "#FF6457", "#D445E4", "#6600FF"),
    labels = c("Fat Body Expressed", "DE Infection", "DE Immune", "Non-DE Immune")
  )
# Labels and theme: black-and-white base theme, explicit text sizes, no grid.
Supplemental_Figure_S3 <- Supplemental_Figure_S3 +
  theme_bw() +
  ylab("Fraction") +
  ggtitle("Percent of SNPs in BLOSUM62 categories") +
  theme(
    plot.title = element_text(size = 12),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Supplemental_Figure_S3

# Extract the numeric BLOSUM62 scores of one table, dropping the "-"
# placeholder used for rows without a score.
blosum_scores <- function(tbl) {
  scores <- as.character(tbl[["BLOSUM62"]])
  as.integer(scores[!(scores == "-")])
}

# Tables are stored in load order:
#   1 = deimmune, 2 = detreated, 3 = exnonDEimm, 4 = expressed,
#   5 = transcources
# The expressed table is therefore element 4 and the trans-source table is
# element 5; elements 3 and 4 would pick up exnonDEimm and expressed instead.
deimmune <- blosum_scores(myfullfiles[[1]])
detreated <- blosum_scores(myfullfiles[[2]])
expressed <- blosum_scores(myfullfiles[[4]])
transcources <- blosum_scores(myfullfiles[[5]])

# Two-sample Kolmogorov-Smirnov tests on the score distributions. The scores
# are integers with many ties, so ks.test() warns that the p-value is
# approximate.
ks.test(expressed,detreated)
## NOTE: output must be regenerated. The previously recorded result
## (D = 0.035717, p-value = 0.7107) was computed while `expressed` held the
## exnonDEimm table, so it does not describe expressed vs detreated.
ks.test(expressed,deimmune)
## NOTE: output must be regenerated. The previously recorded result
## (D = 0.027899, p-value = 0.9896) was computed while `expressed` held the
## exnonDEimm table, so it does not describe expressed vs deimmune.