library("tximport")
library("readr")
library("tximportData")
library("dplyr")
library("DESeq2")
library("ComplexHeatmap")
library("ggplot2")
library("statmod")
library("pheatmap")
library("ggrepel")
library("tidyverse")
library("fgsea")



# Collect the RSEM gene-level result files and name each path by its sample id
# (file name with the ".genes.results" suffix removed). The dots are escaped
# and the suffix is anchored so only the literal suffix is matched/stripped.
files_to_read <- list.files(
  path = "/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WTS/08_A3B_A3A",
  pattern = "\\.genes\\.results$",
  full.names = TRUE
)
names(files_to_read) <- sub("\\.genes\\.results$", "", basename(files_to_read))

# Tab-separated annotation table without a header row. Columns V10 and V14 are
# pasted into ids of the form "<V10>_<V14>", which are later matched against
# the tximport row names.
# NOTE(review): V10/V14 are presumably the gene id and gene symbol - confirm
# against the file.
gtf_df <- read.csv("/home/users/ayh/Projects/reference/RSEM/test.2.gtf",
                   header = FALSE,
                   sep = "\t")
gtf_ss_df <- gtf_df %>%
  dplyr::select(V10, V14) %>%
  mutate(id = paste(V10, V14, sep = "_"))

# Sample sheet: converted to a plain data.frame so rows can be addressed by
# sample id. Later code reads the columns id, dose, time and type.
metadata <- read_tsv("/home/users/ayh/Projects/27_A3B/03_sequencing/single_clone_sequencing/WTS/08_A3B_A3A/metadata.v2.txt") %>%
  as.data.frame()
rownames(metadata) <- metadata$id

#metadata<-metadata[,2:4]

###1. A3A top 500 gene  ##########
##################################


# A3A, clone C3 only: differential expression between fac1 (0h) and fac2 (48h).
i <- "48h"
cond <- "3ug"
sample <- "A3A"
fac1 <- "0h"
fac2 <- i

# Keep samples at the chosen dose or at 0h, of the chosen type, at one of the
# two time points, whose id contains "C3"; one specific sample is excluded.
vol_meta <- metadata[metadata$dose == cond | metadata$time == "0h", ]
vol_meta <- vol_meta[vol_meta$type == sample, ]
vol_meta <- vol_meta[vol_meta$time == fac2 | vol_meta$time == fac1, ]
vol_meta <- vol_meta[grepl("C3", vol_meta$id) & vol_meta$id != "A3A_1st_C3_48h_3ug", ]
vol_files_to_read <- files_to_read[names(files_to_read) %in% rownames(vol_meta)]

# The files are in list.files() order while vol_meta is in metadata order.
# DESeq2 does not reorder colData, so put the metadata rows in exactly the
# order of the imported columns (this also drops rows that have no file).
vol_meta <- vol_meta[names(vol_files_to_read), , drop = FALSE]
# Make the reference level explicit: fac1 is the baseline, fac2 the contrast.
vol_meta$time <- factor(vol_meta$time, levels = c(fac1, fac2))

vol_txi.rsem <- tximport(vol_files_to_read, type = "rsem", txIn = FALSE, txOut = FALSE)

# Restrict the three per-gene matrices to ids present in gtf_ss_df and strip
# the leading "ENSG<digits>_" prefix from the row names.
txi_mats <- c("abundance", "counts", "length")
vol_txi.rsem[txi_mats] <- lapply(vol_txi.rsem[txi_mats], function(mat) {
  mat <- mat[rownames(mat) %in% gtf_ss_df$id, , drop = FALSE]
  rownames(mat) <- gsub("ENSG[0-9]*_", "", rownames(mat))
  mat
})
# Zero lengths are replaced by 1 before the lengths are handed to DESeq2.
vol_txi.rsem$length[vol_txi.rsem$length == 0] <- 1

vol_dds <- DESeqDataSetFromTximport(vol_txi.rsem,
                                    colData = vol_meta,
                                    design = ~time)
vol_dds <- DESeq(vol_dds)

# A3A: flag genes whose variance across samples exceeds a fitted
# mean-dependent trend, and write the 500 most significant ones to disk.

# Size-factor normalised counts.
lib.size <- estimateSizeFactorsForMatrix(vol_txi.rsem$counts)
ed <- t(t(vol_txi.rsem$counts) / lib.size)

means <- rowMeans(ed)
vars <- apply(ed, 1, var)
cv2 <- vars / means^2  # squared coefficient of variation

smoothScatter(log(means), log(cv2))

# Fit cv2 = a1 / mean + a0, using only genes at or above the 95th percentile
# of the means of genes with cv2 > 0.3.
minMeanForFit <- unname(quantile(means[which(cv2 > .3)], .95))
useForFit <- means >= minMeanForFit
fit <- glmgam.fit(cbind(a0 = 1, a1tilde = 1 / means[useForFit]), cv2[useForFit])
a0 <- unname(fit$coefficients["a0"])
a1 <- unname(fit$coefficients["a1tilde"])
fit$coefficients
dev.off()

# Redraw the scatter with the fitted curve and a 95% chi-square band.
par(mar = c(3.5, 3.5, 1, 1), mgp = c(2, 0.65, 0), cex = 0.9)
smoothScatter(log(means), log(cv2))
xg <- exp(seq(min(log(means[means > 0])), max(log(means)), length.out = 1000))
vfit <- a1 / xg + a0
lines(log(xg), log(vfit), col = "black", lwd = 3)
df <- ncol(ed) - 1
lines(log(xg), log(vfit * qchisq(0.975, df) / df), lty = 2, col = "black")
lines(log(xg), log(vfit * qchisq(0.025, df) / df), lty = 2, col = "black")

# Ratio of observed to fitted variance; genes ordered from most to least
# over-dispersed.
afit <- a1 / means + a0
varFitRatio <- vars / (afit * means^2)
varorder <- order(varFitRatio, decreasing = TRUE)
oed <- ed[varorder, ]

# Highlight the 100 highest ratios (head() tolerates fewer than 100 genes).
top_var_idx <- head(varorder, 100)
points(log(means[top_var_idx]), log(cv2[top_var_idx]), col = 2)

# Chi-square test of the ratio, FDR-adjusted.
pval <- pchisq(varFitRatio * df, df = df, lower.tail = FALSE)
adj.pval <- p.adjust(pval, "fdr")
sigVariedGenes <- adj.pval < 0.05
table(sigVariedGenes)

sig_gene_df <- data.frame("gene_id" = names(adj.pval), "adj.pval" = adj.pval) %>%
  filter(adj.pval < 0.05) %>%
  arrange(adj.pval) %>%
  as_tibble()
sig_gene_top500_df <- sig_gene_df %>% head(500)

# Normalised counts of the 500 most over-dispersed genes.
m <- head(oed, 500)

write.table(sig_gene_top500_df, "/home/users/ayh/Projects/27_A3B/06_Figure_code/top500_A3A.3ug.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE)


# A3A DESeq2 results as a tibble with the gene symbol in `gene_id`.
# results() is evaluated once and reused for all three tables below.
a_res <- results(vol_dds)
a_res_df <- as_tibble(a_res) %>%
  mutate("gene_id" = rownames(a_res))

# Up to 500 genes with padj < 0.05 that also passed the variance filter
# (sig_gene_df), ordered by padj.
top500_df <- a_res_df %>%
  filter(padj < 0.05, gene_id %in% sig_gene_df$gene_id) %>%
  arrange(padj) %>%
  head(500)

# Same selection, additionally requiring |log2FoldChange| > 1.
a_top500_edit_df <- a_res_df %>%
  filter(abs(log2FoldChange) > 1, padj < 0.05, gene_id %in% sig_gene_df$gene_id) %>%
  arrange(padj) %>%
  head(500)

# Volcano-plot table: -log10(padj) capped at 300 (this also maps Inf to 300),
# and a direction label for genes beyond |log2FC| > 1 with logp > 2. Genes
# outside both classes, or with NA statistics, get NA.
a_vol_df <- a_res_df %>%
  mutate(logp = pmin(-log10(padj), 300)) %>%
  mutate(dir = case_when(
    log2FoldChange > 1 & logp > 2 ~ "increase",
    log2FoldChange < (-1) & logp > 2 ~ "decrease",
    TRUE ~ NA_character_
  ))

a_anno_gene <- c("APOBEC3A", "CDKN1A", "CDC20", "GADD45A", "IL8", "CCL20", "IFIT1", "IFIT3")

# Annotate the A3A volcano table with the driver gene list. The list's `gene`
# column is renamed to `gene_id` so the join has a shared key; the join itself
# uses all columns the two tables have in common.
driver_gene_df <- read_tsv("/home/users/ayh/Projects/TCGA_cancer_census_gene.txt")
driver_gene_df <- driver_gene_df %>% plyr::rename(c("gene" = "gene_id"))
a_vol_df <- left_join(a_vol_df, driver_gene_df)

# b_vol_df is only created further down in this script, so on a fresh
# top-to-bottom run it does not exist yet and an unconditional join stops the
# script here. Annotate it only if an earlier (interactive) run left it in the
# session.
if (exists("b_vol_df")) {
  b_vol_df <- left_join(b_vol_df, driver_gene_df)
}

# Gene symbol lists for labelling the A3A volcano plot. The plot labels genes
# from `atf_gene` and `a_anno_fin_gene`; the other lists are alternatives kept
# available in the session.
a_vol_anno_gene <- c(
  "APOBEC3A", "CDKN1A", "CDC20", "GADD45A", "IL8", "CCL20", "MDM2", "PLK1",
  "SAPCD2", "NDRG1", "ATF3", "CA9", "EGLN3", "REG1A", "BTG2", "CCND1",
  "GADD153", "DDIT4", "BBC3"
)
#a_vol_anno_gene=c("MGMT")

a_anno_gene <- c(
  "APOBEC3A", "CDKN1A", "CDC20", "GADD45A", "IL8", "CCL20", "IFIT1", "IFIT3",
  "PLK1"
)
a_anno_gene2 <- c(
  "AREG", "FAM72C", "FAM72D", "EREG", "CCNB1", "ANO1", "FGFBP1", "HIST1H3B",
  "PLK1", "CDC20", "KIF14", "CASP14", "EGLN3", "TREM1", "CA9", "ANGPTL4",
  "SERPING1", "ADM", "LOX", "SLC2A3", "SERPINE1", "ABCA12", "REG1A", "ABCA12",
  "ATF3"
)

a_anno_fin_gene <- c(
  "APOBEC3A", "CA9", "ATF3", "ABCA12", "REG1A", "NDRG1", "EGLN3", "CDKN1A",
  "ANO1", "PLK1", "CDC20", "CCNB1", "CCL20", "IL8", "BBC3", "GADD45A"
)
atf_gene <- c(
  "NOS3", "BNIP3L", "SERPINE1", "DDIT4", "NDRG1", "DDB2", "GADD45A", "FDXR",
  "CCNG2", "PRDM1", "BBC3", "ANK1"
)

# A3A volcano plot: log2 fold change against capped -log10(padj), coloured by
# direction, with dashed guides at |log2FC| = 1 and logp = 2. Genes listed in
# atf_gene or a_anno_fin_gene are labelled.
a_vol2 <- ggplot(a_vol_df, aes(x = log2FoldChange, y = logp, col = dir)) +
  geom_point(size = 3) +
  geom_vline(xintercept = 1, linetype = "dashed", colour = "black") +
  geom_vline(xintercept = -1, linetype = "dashed", colour = "black") +
  geom_hline(yintercept = 2, linetype = "dashed", colour = "black") +
  theme_classic() +
  # NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` in
  # element_line(); `size` is kept so the installed version keeps working.
  theme(axis.text = element_text(size = 30),
        axis.title = element_text(size = 40),
        legend.title = element_blank(),
        legend.text = element_text(size = 30),
        axis.ticks.y = element_line(size = 3),
        axis.ticks.x = element_line(size = 2),
        axis.ticks.length = unit(.25, "cm")
        #        legend.position="top"
  ) +
  scale_x_continuous(limits = c(-10, 10)) +
  scale_colour_manual(values = c("#4db7f7", "#c54242"), limits = c("decrease", "increase")) +
  #geom_label_repel(aes(label=ifelse((gene_id%in%a_anno_gene|logp>150)&abs(log2FoldChange)>1.5,gene_id,"")),
  geom_label_repel(aes(label = ifelse(gene_id %in% atf_gene | gene_id %in% a_anno_fin_gene, gene_id, "")),
                   max.overlaps = 10000, min.segment.length = 0.1, vjust = 1,
                   show.legend = FALSE,
                   segment.color = 'grey50',
                   size = 5) +
  scale_y_continuous(limits = c(0, 350)) +
  guides(colour = guide_legend(override.aes = list(size = 7))) +
  ylab(expression(-log[10]*"(adj.Pval)")) +
  xlab(expression(log[2]*"FoldChange"))

# print() is explicit: a bare `a_vol2` is only drawn by top-level auto-printing,
# which does not happen when the script is run through source(), leaving an
# empty PDF.
pdf("/home/users/ayh/Projects/27_A3B/06_Figure_code/Fig1/volcano.A3A.v2.pdf", width = 15, height = 10)
print(a_vol2)
dev.off()

# Export the full A3A table behind the plot.
write.table(a_vol_df,
            "/home/users/ayh/Projects/27_A3B/06_Figure_code/Fig1/A3A_DEseq2.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE)

# Row-scaled expression matrix for the current A3A fit. `mat_test` was
# previously read here before anything in the script had created it, so a
# fresh run stopped with "object 'mat_test' not found". It is now built the
# same way the A3B section builds it before its own gene_order step.
# NOTE(review): this scales assay(vol_dds), mirroring the A3B section; confirm
# that this, rather than the rlog matrix built further down, is the intended
# input.
mat_test <- t(scale(t(assay(vol_dds))))
mat_test <- mat_test[rownames(mat_test) %in% (top500_df$gene_id), ]

# Per gene: median scaled score within each group, keeping the "48h" group and
# ordering genes from highest to lowest. All sample columns are gathered
# (`-gene`) instead of assuming exactly six.
# NOTE(review): samples are assigned to "0h" when their id contains "Ctrl",
# whereas the A3B section matches "0h" - confirm against the sample ids.
gene_order <- mat_test %>%
  as.data.frame() %>%
  as_tibble() %>%
  mutate(gene = rownames(mat_test)) %>%
  gather(id, score, -gene) %>%
  mutate(group = ifelse(grepl("Ctrl", id), "0h", "48h")) %>%
  group_by(gene, group) %>%
  dplyr::summarise(mean_score = median(score)) %>%
  ungroup() %>%
  filter(group == "48h") %>%
  arrange(-mean_score)
#gene_order%>%mutate(order=1:498)%>%filter(gene=="APOBEC3B")


#total assay for plotting

# A3A, all samples at dose `cond` and time fac1/fac2: refit DESeq2 and build
# the row-scaled rlog matrix restricted to the top500_df genes.
vol_meta <- metadata[metadata$dose == cond, ]
vol_meta <- vol_meta[vol_meta$type == sample, ]
vol_meta <- vol_meta[vol_meta$time == fac2 | vol_meta$time == fac1, ]
vol_files_to_read <- files_to_read[names(files_to_read) %in% rownames(vol_meta)]

# The files are in list.files() order while vol_meta is in metadata order.
# DESeq2 does not reorder colData, so put the metadata rows in exactly the
# order of the imported columns (this also drops rows that have no file).
vol_meta <- vol_meta[names(vol_files_to_read), , drop = FALSE]
# Make the reference level explicit: fac1 is the baseline, fac2 the contrast.
vol_meta$time <- factor(vol_meta$time, levels = c(fac1, fac2))
vol_meta
vol_files_to_read

vol_txi.rsem <- tximport(vol_files_to_read, type = "rsem", txIn = FALSE, txOut = FALSE)

# Restrict the three per-gene matrices to ids present in gtf_ss_df and strip
# the leading "ENSG<digits>_" prefix from the row names.
txi_mats <- c("abundance", "counts", "length")
vol_txi.rsem[txi_mats] <- lapply(vol_txi.rsem[txi_mats], function(mat) {
  mat <- mat[rownames(mat) %in% gtf_ss_df$id, , drop = FALSE]
  rownames(mat) <- gsub("ENSG[0-9]*_", "", rownames(mat))
  mat
})
# Zero lengths are replaced by 1 before the lengths are handed to DESeq2.
vol_txi.rsem$length[vol_txi.rsem$length == 0] <- 1
#idx<-rowSu
#vol_txi.rsem$length[,which(vol_txi.rsem$length == 0)]

vol_dds <- DESeqDataSetFromTximport(vol_txi.rsem,
                                    colData = vol_meta,
                                    design = ~time)
vol_dds <- DESeq(vol_dds)

# Quick look at the genes with padj < 0.05 in this fit.
full_res <- results(vol_dds)
as_tibble(full_res) %>%
  mutate("gene_id" = rownames(full_res)) %>%
  filter(padj < 0.05)

# rlog-transformed values, z-scored per gene (row), limited to top500_df genes.
mat_test <- t(scale(t(assay(rlog(vol_dds)))))
mat_test <- mat_test[rownames(mat_test) %in% (top500_df$gene_id), ]

anno_gene <- c("APOBEC3A", "CDKN1A", "CXCL1", "PTGS2", "CXCL2", "CXCL5", "MYC", "CCL20", "IL1R1", "CDKN2B", "MDM2", "CDC20")

#### A3B#########
##################



###1. A3B top 500 gene  ##########
##################################


# A3B: differential expression between fac1 (0h) and fac2 (48h).
i <- "48h"

cond <- "3ug"
sample <- "A3B"

fac1 <- "0h"
fac2 <- i

# Keep samples at the chosen dose or at 0h, of the chosen type, at one of the
# two time points.
vol_meta <- metadata[metadata$dose == cond | metadata$time == "0h", ]
vol_meta <- vol_meta[vol_meta$type == sample, ]
vol_meta <- vol_meta[vol_meta$time == fac2 | vol_meta$time == fac1, ]

vol_files_to_read <- files_to_read[names(files_to_read) %in% rownames(vol_meta)]

# The files are in list.files() order while vol_meta is in metadata order.
# DESeq2 does not reorder colData, so put the metadata rows in exactly the
# order of the imported columns (this also drops rows that have no file).
vol_meta <- vol_meta[names(vol_files_to_read), , drop = FALSE]
# Make the reference level explicit: fac1 is the baseline, fac2 the contrast.
vol_meta$time <- factor(vol_meta$time, levels = c(fac1, fac2))

vol_txi.rsem <- tximport(vol_files_to_read, type = "rsem", txIn = FALSE, txOut = FALSE)

# Restrict the three per-gene matrices to ids present in gtf_ss_df and strip
# the leading "ENSG<digits>_" prefix from the row names.
txi_mats <- c("abundance", "counts", "length")
vol_txi.rsem[txi_mats] <- lapply(vol_txi.rsem[txi_mats], function(mat) {
  mat <- mat[rownames(mat) %in% gtf_ss_df$id, , drop = FALSE]
  rownames(mat) <- gsub("ENSG[0-9]*_", "", rownames(mat))
  mat
})
# Zero lengths are replaced by 1 before the lengths are handed to DESeq2.
vol_txi.rsem$length[vol_txi.rsem$length == 0] <- 1

vol_dds <- DESeqDataSetFromTximport(vol_txi.rsem,
                                    colData = vol_meta,
                                    design = ~time)
vol_dds <- DESeq(vol_dds)


# A3B: flag genes whose variance across samples exceeds a fitted
# mean-dependent trend, and write the 500 most significant ones to disk.

# Size-factor normalised counts.
lib.size <- estimateSizeFactorsForMatrix(vol_txi.rsem$counts)
ed <- t(t(vol_txi.rsem$counts) / lib.size)

means <- rowMeans(ed)
vars <- apply(ed, 1, var)
cv2 <- vars / means^2  # squared coefficient of variation

smoothScatter(log(means), log(cv2))

# Fit cv2 = a1 / mean + a0, using only genes at or above the 95th percentile
# of the means of genes with cv2 > 0.3.
minMeanForFit <- unname(quantile(means[which(cv2 > .3)], .95))
useForFit <- means >= minMeanForFit
fit <- glmgam.fit(cbind(a0 = 1, a1tilde = 1 / means[useForFit]), cv2[useForFit])
a0 <- unname(fit$coefficients["a0"])
a1 <- unname(fit$coefficients["a1tilde"])

# Redraw the scatter with the fitted curve and a 95% chi-square band.
par(mar = c(3.5, 3.5, 1, 1), mgp = c(2, 0.65, 0), cex = 0.9)
smoothScatter(log(means), log(cv2))
xg <- exp(seq(min(log(means[means > 0])), max(log(means)), length.out = 1000))
vfit <- a1 / xg + a0
lines(log(xg), log(vfit), col = "black", lwd = 3)
df <- ncol(ed) - 1
lines(log(xg), log(vfit * qchisq(0.975, df) / df), lty = 2, col = "black")
lines(log(xg), log(vfit * qchisq(0.025, df) / df), lty = 2, col = "black")

# Ratio of observed to fitted variance; genes ordered from most to least
# over-dispersed.
afit <- a1 / means + a0
varFitRatio <- vars / (afit * means^2)
varorder <- order(varFitRatio, decreasing = TRUE)
oed <- ed[varorder, ]

# Highlight the 500 highest ratios (head() tolerates fewer than 500 genes).
top_var_idx <- head(varorder, 500)
points(log(means[top_var_idx]), log(cv2[top_var_idx]), col = 2)

# Chi-square test of the ratio, FDR-adjusted.
pval <- pchisq(varFitRatio * df, df = df, lower.tail = FALSE)
adj.pval <- p.adjust(pval, "fdr")
# NOTE(review): this flag uses 0.001 while sig_gene_df below uses 0.05;
# confirm which cut-off is intended.
sigVariedGenes <- adj.pval < 0.001

sig_gene_df <- data.frame("gene_id" = names(adj.pval), "adj.pval" = adj.pval) %>%
  filter(adj.pval < 0.05) %>%
  arrange(adj.pval)
sig_gene_top500_df <- sig_gene_df %>% head(500)

# Normalised counts of the 500 most over-dispersed genes.
m <- head(oed, 500)

write.table(sig_gene_top500_df, "/home/users/ayh/Projects/27_A3B/06_Figure_code/top500_A3B.3ug.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE)

# NOTE(review): this reads "top500_A3B.txt", not the "top500_A3B.3ug.txt" file
# written just above - confirm the path is intentional.
m_a <- read_tsv("/home/users/ayh/Projects/27_A3B/06_Figure_code/top500_A3B.txt")


# Row-scaled (z-scored per gene) assay matrix of the current A3B fit.
mat_test <- t(scale(t(assay(vol_dds))))

# Quick look at the scaled APOBEC3B row.
mat_test[rownames(mat_test) == "APOBEC3B", ]

# A3B DESeq2 results as a tibble with the gene symbol in `gene_id`.
# results() is evaluated once and reused for both tables below.
b_res <- results(vol_dds)
b_res_df <- as_tibble(b_res) %>%
  mutate("gene_id" = rownames(b_res))

# Up to 500 genes with padj < 0.05 that also passed the variance filter
# (sig_gene_df), ordered by padj. This replaces the A3A top500_df.
top500_df <- b_res_df %>%
  filter(padj < 0.05, gene_id %in% sig_gene_df$gene_id) %>%
  arrange(padj) %>%
  head(500)

# Same selection, additionally requiring |log2FoldChange| > 1.
b_top500_edit_df <- b_res_df %>%
  filter(abs(log2FoldChange) > 1, padj < 0.05, gene_id %in% sig_gene_df$gene_id) %>%
  arrange(padj) %>%
  head(500)

mat_test <- mat_test[rownames(mat_test) %in% (top500_df$gene_id), ]

# Per gene: median scaled score within each group (sample ids containing "0h"
# versus the rest), keeping the "48h" group and ordering genes from highest to
# lowest. All sample columns are gathered (`-gene`) instead of assuming
# exactly six.
gene_order <- mat_test %>%
  as.data.frame() %>%
  as_tibble() %>%
  mutate(gene = rownames(mat_test)) %>%
  gather(id, score, -gene) %>%
  mutate(group = ifelse(grepl("0h", id), "0h", "48h")) %>%
  group_by(gene, group) %>%
  dplyr::summarise(mean_score = median(score)) %>%
  ungroup() %>%
  filter(group == "48h") %>%
  arrange(-mean_score)

#total assay for plotting

# A3B, all samples at dose `cond` and time fac1/fac2: refit DESeq2 and build
# the row-scaled rlog matrix restricted to the top500_df genes.
vol_meta <- metadata[metadata$dose == cond, ]
vol_meta <- vol_meta[vol_meta$type == sample, ]
vol_meta <- vol_meta[vol_meta$time == fac2 | vol_meta$time == fac1, ]

vol_files_to_read <- files_to_read[names(files_to_read) %in% rownames(vol_meta)]

# The files are in list.files() order while vol_meta is in metadata order.
# DESeq2 does not reorder colData, so put the metadata rows in exactly the
# order of the imported columns (this also drops rows that have no file).
vol_meta <- vol_meta[names(vol_files_to_read), , drop = FALSE]
# Make the reference level explicit: fac1 is the baseline, fac2 the contrast.
vol_meta$time <- factor(vol_meta$time, levels = c(fac1, fac2))

vol_txi.rsem <- tximport(vol_files_to_read, type = "rsem", txIn = FALSE, txOut = FALSE)

# Restrict the three per-gene matrices to ids present in gtf_ss_df and strip
# the leading "ENSG<digits>_" prefix from the row names.
txi_mats <- c("abundance", "counts", "length")
vol_txi.rsem[txi_mats] <- lapply(vol_txi.rsem[txi_mats], function(mat) {
  mat <- mat[rownames(mat) %in% gtf_ss_df$id, , drop = FALSE]
  rownames(mat) <- gsub("ENSG[0-9]*_", "", rownames(mat))
  mat
})
# Zero lengths are replaced by 1 before the lengths are handed to DESeq2.
vol_txi.rsem$length[vol_txi.rsem$length == 0] <- 1

vol_dds <- DESeqDataSetFromTximport(vol_txi.rsem,
                                    colData = vol_meta,
                                    design = ~time)
vol_dds <- DESeq(vol_dds)

# rlog-transformed values, z-scored per gene (row), limited to top500_df genes.
mat_test <- t(scale(t(assay(rlog(vol_dds)))))
mat_test <- mat_test[rownames(mat_test) %in% (top500_df$gene_id), ]

# Candidate label genes for A3B. The trailing comma after the last element was
# removed: c() rejects an empty argument ("argument 10 is empty"), which
# stopped the script here.
b_anno_gene <- c("APOBEC3B", "CDKN1A", "GADD45A", "IL8", "CXCL1", "CCL20", "MX1", "IFIT1", "IFIT3")



# A3B volcano-plot table from the current fit. results() is evaluated once.
# logp is -log10(padj) capped at 300 (this also maps Inf to 300); `dir` labels
# genes beyond |log2FC| > 1 with logp > 2. Genes outside both classes, or with
# NA statistics, get NA.
b_vol_res <- results(vol_dds)
b_vol_df <- as_tibble(b_vol_res) %>%
  mutate("gene_id" = rownames(b_vol_res)) %>%
  mutate(logp = pmin(-log10(padj), 300)) %>%
  mutate(dir = case_when(
    log2FoldChange > 1 & logp > 2 ~ "increase",
    log2FoldChange < (-1) & logp > 2 ~ "decrease",
    TRUE ~ NA_character_
  ))

# Gene symbol lists; the A3B volcano plot labels the genes in b_vol_anno_gene.
anno_gene <- c(
  "APOBEC3B", "CXCL6", "CCL20", "TNFAIP3", "CXCL1", "CXCL3", "PTGS2", "CXCL2",
  "CXCL5", "IL17RB", "MUC5AC", "NFKBIA", "LCN2", "CCL2", "FOSB", "CDKN1A"
)
b_vol_anno_gene <- c(
  "APOBEC3B", "YARS", "CCL20", "CDKN1A", "IL8", "TNFAIP2", "CXCL2", "ADRA2A",
  "ICAM1", "MUC13", "ATF3", "TGM2", "ANXA10"
)

# A3B volcano plot: log2 fold change against capped -log10(padj), coloured by
# direction, with dashed guides at |log2FC| = 1 and logp = 2. Genes listed in
# b_vol_anno_gene are labelled.
b_vol <- ggplot(b_vol_df, aes(x = log2FoldChange, y = logp, col = dir)) +
  geom_point(size = 3) +
  geom_vline(xintercept = 1, linetype = "dashed", colour = "black") +
  geom_vline(xintercept = -1, linetype = "dashed", colour = "black") +
  geom_hline(yintercept = 2, linetype = "dashed", colour = "black") +
  theme_classic() +
  # NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` in
  # element_line(); `size` is kept so the installed version keeps working.
  theme(axis.text = element_text(size = 30),
        axis.title = element_text(size = 40),
        legend.title = element_blank(),
        legend.text = element_text(size = 30),
        axis.ticks.y = element_line(size = 3),
        axis.ticks.x = element_line(size = 2),
        axis.ticks.length = unit(.25, "cm")
        #        legend.position="top"
  ) +
  scale_x_continuous(limits = c(-11, 11)) +
  scale_colour_manual(values = c("#4db7f7", "#c54242"), limits = c("decrease", "increase")) +
  #geom_label_repel(aes(label=ifelse(logp>150&(abs(log2FoldChange)>1.5),gene_id,"")),
  geom_label_repel(aes(label = ifelse(gene_id %in% b_vol_anno_gene, gene_id, "")),
                   max.overlaps = 1000, min.segment.length = 0.1, vjust = 1,
                   show.legend = FALSE,
                   segment.color = 'grey50',
                   size = 5) +
  scale_y_continuous(limits = c(0, 350)) +
  guides(colour = guide_legend(override.aes = list(size = 7))) +
  ylab(expression(-log[10]*"(adj.Pval)")) +
  xlab(expression(log[2]*"FoldChange"))

# print() is explicit: a bare `b_vol` is only drawn by top-level auto-printing,
# which does not happen when the script is run through source(), leaving an
# empty PDF.
pdf("/home/users/ayh/Projects/27_A3B/06_Figure_code/Fig1/volcano.A3B.pdf", width = 15, height = 10)
print(b_vol)
dev.off()

