
# Two-group differential test with the legacy DESeq package on the
# TEtranscripts count table (features in rows, samples in columns).
data <- read.table(
  "/Users/alexandradallaire/TEtranscripts_outputs/spore_polyARNA2.cntTable",
  header = TRUE,
  row.names = 1
)

# NOTE(review): this labelling assumes the first 16 columns of the table are
# the "T" samples and the last 4 are the "C" samples - confirm against the file.
groups <- factor(rep(c("TGroup", "CGroup"), times = c(16, 4)))

# Keep only features whose largest count in any sample exceeds min_read.
min_read <- 1
data <- data[apply(data, 1, max) > min_read, ]

library(DESeq, quietly = TRUE)
cds <- newCountDataSet(data, groups)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds, method = "per-condition")
res <- nbinomTest(cds, "CGroup", "TGroup")
res_fc <- res

# Significant hits: adjusted p-value present and below 0.05, non-zero fold change.
resSig <- res_fc[
  !is.na(res_fc$padj) & res_fc$padj < 0.05 & abs(res_fc$log2FoldChange) > 0,
]

# Put the sample columns in a fixed order: A0 first, then A24h, A24e, A48h, A48e
# (four replicates each).
data_ord <- data[, c(
  "A0_1Aligned.out.sam.C",
  "A0_2Aligned.out.sam.C",
  "A0_3Aligned.out.sam.C",
  "A0_4Aligned.out.sam.C",
  "A24h_1Aligned.out.sam.T",
  "A24h_2Aligned.out.sam.T",
  "A24h_3Aligned.out.sam.T",
  "A24h_4Aligned.out.sam.T",
  "A24e_1Aligned.out.sam.T",
  "A24e_2Aligned.out.sam.T",
  "A24e_3Aligned.out.sam.T",
  "A24e_4Aligned.out.sam.T",
  "A48h_1Aligned.out.sam.T",
  "A48h_2Aligned.out.sam.T",
  "A48h_3Aligned.out.sam.T",
  "A48h_4Aligned.out.sam.T",
  "A48e_1Aligned.out.sam.T",
  "A48e_2Aligned.out.sam.T",
  "A48e_3Aligned.out.sam.T",
  "A48e_4Aligned.out.sam.T"
), drop = FALSE]

# Sample annotation table; the design below uses its `condition` column and the
# PCA plot additionally uses `replicate`.
# NOTE(review): rows are presumably in the same order as the columns of
# data_ord - confirm against the .tsv file.
colData <- read.table(
  "sampleinfo_TEtranscripts.tsv",
  header = TRUE,
  check.names = FALSE
)

# NOTE(review): DESeqDataSetFromMatrix/DESeq/results/vst/plotPCA come from
# DESeq2 and theme_classic from ggplot2; neither package is attached in the
# visible part of this script - confirm they are loaded beforehand.
ddsFullCountTable <- DESeqDataSetFromMatrix(
  countData = data_ord,
  colData = colData,
  design = ~condition
)
ddsFullCountTable

# Fit the model and extract the default results table at alpha = 0.05.
dds <- DESeq(ddsFullCountTable)
res <- results(dds, alpha = 0.05)
res
summary(res)

# Plain data.frame copy of the results. This variable shares its name with the
# results() function; calls such as results(dds, ...) still resolve to the
# function.
results <- as.data.frame(res)

# Rows with a raw p-value below 0.05 and an absolute log2 fold change above 0.5.
resSig <- res[
  !is.na(res$pvalue) & res$pvalue < 0.05 & abs(res$log2FoldChange) > 0.5,
]

# Variance-stabilised data for the PCA plot.
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup = c("condition", "replicate")) + theme_classic()


################################################################################################
# plot the top TE/repeats categories expressed (FIGURE 1B)
################################################################################################
# Split each row label on ":" into three pieces and append them as columns
# X1, X2 and X3 of the results table.
library(stringr)
res_p <- read.table(
  "spore_polyARNA_gene_TE_DESEQ2analysis.txt",
  header = TRUE,
  check.names = FALSE
)
res_p$label <- rownames(res_p)
res_psplit <- data.frame(
  str_split_fixed(string = res_p$label, pattern = ":", n = 3)
)
res_pmerge <- cbind(res_p, res_psplit)

# Keep only entries with a mean normalised count above 100.
res_p <- res_pmerge[res_pmerge$baseMean > 100, ]

# Case-insensitive class filters. Helitrons are matched on X2, the other
# classes on X3.
HEL <- dplyr::filter(res_p, grepl(pattern = "helitron", x = X2, ignore.case = TRUE)) # n=36
DNA <- dplyr::filter(res_p, grepl(pattern = "dna", x = X3, ignore.case = TRUE)) # n=415
LTR <- dplyr::filter(res_p, grepl(pattern = "ltr", x = X3, ignore.case = TRUE)) # n=420
LINE <- dplyr::filter(res_p, grepl(pattern = "line", x = X3, ignore.case = TRUE)) # n=171

# For each class table below: group by X2, count the distinct X1 values in each
# group, then sort the groups from most to fewest distinct entries.
# NOTE(review): data.table() and arrange()/desc() are used without a visible
# library() call - confirm the packages are attached before this point.

# Top DNA sub-families that are EXPRESSED
ZDT <- data.table(DNA)
Zcat <- ZDT[, list(number_of_distinct_orders = length(unique(X1))), by = X2]
Zcatord <- arrange(Zcat, desc(number_of_distinct_orders))

# Top LTR sub-families that are EXPRESSED
ZDT2 <- data.table(LTR)
Zcat2 <- ZDT2[, list(number_of_distinct_orders = length(unique(X1))), by = X2]
Zcatord2 <- arrange(Zcat2, desc(number_of_distinct_orders))

# Top LINE sub-families that are EXPRESSED
ZDT3 <- data.table(LINE)
Zcat3 <- ZDT3[, list(number_of_distinct_orders = length(unique(X1))), by = X2]
Zcatord3 <- arrange(Zcat3, desc(number_of_distinct_orders))

# Draw one stacked bar for a table of sub-family tallies.
#
# `top_df` must have the columns `number_of_distinct_orders`, `X2` and `order`
# (rank, 1 = largest). The y aesthetic is the tally turned into a factor whose
# levels are ordered by `-order`; each segment is filled and labelled by X2.
# NOTE(review): the ggplot2 functions are used without a visible library()
# call - confirm ggplot2 is attached before this point.
plot_subfamily_stack <- function(top_df) {
  ggplot(
    top_df,
    aes(
      x = "",
      y = reorder(number_of_distinct_orders, -order),
      fill = X2,
      label = X2
    )
  ) +
    geom_bar(width = 1, stat = "identity") +
    theme_classic() +
    geom_text(size = 3, position = position_stack(vjust = 0.5)) +
    theme(legend.position = "none")
}

# head() is used instead of `[1:n, ]` so that a table with fewer than n rows is
# not padded with all-NA rows, and seq_len() instead of seq.int() so that an
# empty table yields an empty rank column rather than c(1, 0).

# Top 20 DNA sub-families
df1 <- data.frame(head(Zcatord, 20))
df1$order <- seq_len(nrow(df1))
bp <- plot_subfamily_stack(df1)
bp

# Top 10 LTR sub-families
df2 <- data.frame(head(Zcatord2, 10))
df2$order <- seq_len(nrow(df2))
bp <- plot_subfamily_stack(df2)
bp

# Top 20 LINE sub-families
df3 <- data.frame(head(Zcatord3, 20))
df3$order <- seq_len(nrow(df3))
bp <- plot_subfamily_stack(df3)
bp

# Bar plot of the number of expressed entries per class.
# The counts are taken from the filtered tables HEL, DNA, LTR and LINE rather
# than typed in by hand (they were previously hard-coded as 36, 415, 420 and
# 171), so the figure cannot drift out of sync with the input data.
df <- data.frame(
  superfamily = c("Helitron", "DNA", "LTR", "LINE"),
  number = c(nrow(HEL), nrow(DNA), nrow(LTR), nrow(LINE))
)
p <- ggplot(data = df, aes(x = superfamily, y = number)) +
  geom_bar(stat = "identity") +
  theme_classic()
p



# Load the raw count table plus two analysis tables, keeping the column names
# exactly as they appear in the files.
counts <- read.table(
  "spore_polyARNA2.cntTable",
  header = TRUE,
  check.names = FALSE
)
sigdiff <- read.table(
  "spore_polyARNA2_sigdiff_gene_TE.txt",
  header = TRUE,
  check.names = FALSE
)
expr <- read.table(
  "spore_polyARNA2_gene_TE_analysis.txt",
  header = TRUE,
  check.names = FALSE
)

# Move the "gene/TE" identifier column (the first column) into the row names.
rownames(counts) <- counts[["gene/TE"]]
counts <- counts[, -1]

# Keep only features whose largest count in any sample exceeds min_read.
min_read <- 1
counts <- counts[apply(counts, 1, max) > min_read, ]





# PCA PLOT (FIGURE 1D)
# Reorder the sample columns: A0 first, then A24h, A24e, A48h, A48e.
counts_ord <- counts[, c(
  "A0_1Aligned.out.sam.C",
  "A0_2Aligned.out.sam.C",
  "A0_3Aligned.out.sam.C",
  "A0_4Aligned.out.sam.C",
  "A24h_1Aligned.out.sam.T",
  "A24h_2Aligned.out.sam.T",
  "A24h_3Aligned.out.sam.T",
  "A24h_4Aligned.out.sam.T",
  "A24e_1Aligned.out.sam.T",
  "A24e_2Aligned.out.sam.T",
  "A24e_3Aligned.out.sam.T",
  "A24e_4Aligned.out.sam.T",
  "A48h_1Aligned.out.sam.T",
  "A48h_2Aligned.out.sam.T",
  "A48h_3Aligned.out.sam.T",
  "A48h_4Aligned.out.sam.T",
  "A48e_1Aligned.out.sam.T",
  "A48e_2Aligned.out.sam.T",
  "A48e_3Aligned.out.sam.T",
  "A48e_4Aligned.out.sam.T"
), drop = FALSE]

# Transpose so that samples are rows and features are columns.
pcaData <- as.data.frame(t(counts_ord))

# Remember how many feature columns there are before the annotation column is
# appended. This replaces the former hard-coded range 1:3165, which only worked
# while exactly 3165 features survived the min_read filter.
n_features <- ncol(pcaData)

# Add type of experiment (one label per sample, in the column order above).
pcaData["type"] <- c(
  rep("Control", 4),
  rep("24mock", 4),
  rep("24exu", 4),
  rep("48mock", 4),
  rep("48exu", 4)
)

# NOTE(review): autoplot() for prcomp objects is provided by ggfortify, whose
# library() call is commented out below - confirm it is attached beforehand.
#library("ggfortify")
autoplot(
  prcomp(pcaData[, seq_len(n_features)]),
  data = pcaData,
  colour = "type",
  label = TRUE,
  label.size = 0
) +
  theme_classic()


# HEATMAP (FIGURE 1C)
# Extract normalised counts. `counts` is also the name of a data.frame created
# earlier in this script; in call position R still resolves counts(dds, ...)
# to the function.
dds <- estimateSizeFactors(dds)
normcounts <- data.frame(counts(dds, normalized = TRUE))
normcounts$rownames.resdf. <- rownames(normcounts)

# One results table per treatment, each contrasted against "untreated".
rescol1 <- results(dds, contrast = c("condition", "mock24", "untreated"))
resdf <- data.frame(rescol1)
rescol2 <- results(dds, contrast = c("condition", "exudates24", "untreated"))
resdf2 <- data.frame(rescol2)
rescol3 <- results(dds, contrast = c("condition", "mock48", "untreated"))
resdf3 <- data.frame(rescol3)
rescol4 <- results(dds, contrast = c("condition", "exudates48", "untreated"))
resdf4 <- data.frame(rescol4)

# Make a dataframe with significant DE genes.
# Gather baseMean / log2FoldChange / padj of the four contrasts side by side;
# the column names are written out explicitly.
hmdf <- data.frame(
  rownames.resdf. = rownames(resdf),
  resdf.baseMean = resdf$baseMean,
  resdf.log2FoldChange = resdf$log2FoldChange,
  resdf.padj = resdf$padj,
  resdf2.baseMean = resdf2$baseMean,
  resdf2.log2FoldChange = resdf2$log2FoldChange,
  resdf2.padj = resdf2$padj,
  resdf3.baseMean = resdf3$baseMean,
  resdf3.log2FoldChange = resdf3$log2FoldChange,
  resdf3.padj = resdf3$padj,
  resdf4.baseMean = resdf4$baseMean,
  resdf4.log2FoldChange = resdf4$log2FoldChange,
  resdf4.padj = resdf4$padj
)

# Step 1: absolute log2 fold change above 0.5 in all four contrasts.
hmdf_sig <- hmdf[with(
  hmdf,
  abs(resdf.log2FoldChange) > 0.5 &
    abs(resdf2.log2FoldChange) > 0.5 &
    abs(resdf3.log2FoldChange) > 0.5 &
    abs(resdf4.log2FoldChange) > 0.5
), ]

# Step 2: adjusted p-value below 0.05 in all four contrasts.
hmdf_sig2 <- hmdf_sig[with(
  hmdf_sig,
  resdf.padj < 0.05 &
    resdf2.padj < 0.05 &
    resdf3.padj < 0.05 &
    resdf4.padj < 0.05
), ]

# NA comparisons in the two masks above produce all-NA rows; remove them.
hmdf_sig2 <- drop_na(hmdf_sig2)

# Attach the normalised counts by feature name.
merge <- join(hmdf_sig2, normcounts, by = "rownames.resdf.")

# Keep the feature name plus the four fold-change columns, then turn the
# feature name (first column) into the row names.
hmdf_sig_final <- merge[, c(
  "rownames.resdf.",
  "resdf.log2FoldChange",
  "resdf2.log2FoldChange",
  "resdf3.log2FoldChange",
  "resdf4.log2FoldChange"
), drop = FALSE]
hmdf_sig_final <- data.frame(hmdf_sig_final, row.names = 1)

library("pheatmap")
library("RColorBrewer")

# Colour breaks from -4 to 4 in steps of 0.05; the palette is a reversed
# RdYlBu ramp interpolated to one colour per break value.
breaksList <- seq(-4, 4, by = 0.05)

# Heatmap of the log2 fold changes: rows are clustered (complete linkage),
# columns keep their given order, and values are not rescaled.
pheatmap(
  hmdf_sig_final,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(breaksList)),
  breaks = breaksList,
  scale = "none",
  cluster_rows = TRUE,
  cluster_cols = FALSE,
  clustering_method = "complete",
  show_rownames = TRUE,
  border_color = NA,
  fontsize = 5,
  fontsize_row = 5,
  cellwidth = 10,
  height = 50,
  annotation_width = 50
)




