

################################################################################################
# Single locus TE analysis using DESeq2
################################################################################################

#DeSeq2 analysis 
# Install dependencies only when they are missing, so that re-running the
# script does not reinstall every package each time.
for (pkg in c("data.table", "tidyr", "dplyr", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# DESeq2 is a Bioconductor package and is installed through BiocManager.
if (!requireNamespace("DESeq2", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("DESeq2")
}

# DESeq2 (needed for DESeqDataSetFromMatrix/DESeq/results/vst/plotPCA below)
# is attached first: its dependencies export helpers such as desc(), and
# attaching dplyr afterwards keeps the dplyr versions on top of the search
# path, exactly as in the rest of this script.
library(DESeq2)
library(data.table)
library(tidyr)
library(dplyr)
library(ggplot2)

# Load the per-locus read counts and the repeat annotation table.
counts <- read.table(
  "MM100_TE2_allcounts.txt",
  header = TRUE,
  fill = TRUE
)
annotTE <- read.table(
  "Rir_HGAP_ii_V2.fa.out.copy.txt",
  header = FALSE,
  fill = TRUE
)

# Drop annotation rows whose first column is "*".
annotTE <- annotTE[!(annotTE$V1 == "*"), ]
# Renumber the TEs (GeneId) to remove ID duplicates (same numbering as used
# to make the gff files we used for FeatureCounts).
annotTE[["V15"]] <- seq.int(nrow(annotTE))

# Put replicates and conditions in order: columns 7 to 28 are renamed to the
# letters a..v. Sample layout as annotated by the author:
#   a-d: 0h    e-h: 24 e    i-l: 24 h    m-p: 48 h    q-t: 48 e    u-v: mays
counts[["number"]] <- 1
names(counts)[7:28] <- letters[1:22]

# Reorganize data frames (similar to bed files).
# Sample columns are reordered as a-d, i-l, e-h, m-v.
countTE <- dplyr::select(counts, Geneid, a:d, i:l, e:h, m:v)
# V15 becomes the join key "Geneid"; V2 is carried along as "percdiv" so the
# % divergence can be checked later.
annotTE <- dplyr::select(
  annotTE,
  V5, V6, V7, V10, V11,
  Geneid = V15,
  percdiv = V2
)
annotTE$length <- abs(annotTE$V7 - annotTE$V6)

# Join counts and annotation on Geneid (adds details on what locus is what
# type of repeat), then recompute the locus length on the merged table.
final1 <- merge(countTE, annotTE, by = "Geneid")
final1$length <- abs(final1$V7 - final1$V6)

# Count table with annotations.
countdata <- dplyr::select(
  final1,
  Geneid, a:d, i:l, e:h, m:v,
  V5:V7, V10, V11, percdiv, length
)

# Only KNOWN TEs longer than 100 bp are analysed.
# First keep loci with more than 5 reads in more than 2 samples ...
erangedf <- countdata[rowSums(dplyr::select(countdata, a:v) > 5) > 2, ]
# ... then keep loci longer than 100 bp.
erangedf_filt <- erangedf[erangedf$length > 100, ]

# Assign every locus to a TE category based on its V11 annotation.
# Unknown, Simple_repeat and Low_complexity are labelled here but are left
# out of the merged table below.
# (The n recorded by the author for each category came from an earlier run.)
erangedf_filt$group <- NA

unknown <- erangedf_filt %>% dplyr::filter(grepl("Unknown", V11)) # n=12860
unknown$group <- "Unknown"
simple_repeat <- erangedf_filt %>% dplyr::filter(grepl("Simple_repeat", V11)) # n=1655
simple_repeat$group <- "Simple_repeat"
low_complexity <- erangedf_filt %>% dplyr::filter(grepl("Low_complexity", V11)) # n=13
low_complexity$group <- "Low_complexity"

# LTR elements: Gypsy versus everything else.
LTR <- erangedf_filt %>% dplyr::filter(grepl("LTR", V11)) # n=466
gypsy <- LTR %>% dplyr::filter(grepl("LTR/Gypsy", V11)) # n=267
gypsy$group <- "LTR/Gypsy"
ltrother <- LTR %>% dplyr::filter(!grepl("LTR/Gypsy", V11)) # n=132
ltrother$group <- "LTR/Other"

line <- erangedf_filt %>% dplyr::filter(grepl("LINE/", V11)) # n=303
line$group <- "LINE"

# DNA transposon superfamilies.
hAT <- erangedf_filt %>% dplyr::filter(grepl("DNA/hAT", V11)) # n=375
hAT$group <- "DNA/hAT"
tcMar <- erangedf_filt %>% dplyr::filter(grepl("DNA/Tc", V11)) # n=263
tcMar$group <- "DNA/TcMar"
cmc <- erangedf_filt %>% dplyr::filter(grepl("DNA/CMC", V11)) # n=144
cmc$group <- "DNA/CMC"
mule <- erangedf_filt %>% dplyr::filter(grepl("DNA/MULE", V11)) # n=255
mule$group <- "DNA/MULE"
mav <- erangedf_filt %>% dplyr::filter(grepl("DNA/Maverick", V11)) # n=148
mav$group <- "DNA/Maverick"
helitron <- erangedf_filt %>% dplyr::filter(grepl("Helitron", V11)) # n=111
helitron$group <- "Helitron"

# Remaining DNA elements: any "DNA/" entry that matches none of the
# superfamilies labelled above.
DNA <- erangedf_filt %>% dplyr::filter(grepl("DNA/", V11)) # n=216
DNAother <- DNA %>%
  dplyr::filter(
    !grepl("DNA/CMC|DNA/hAT|DNA/Maverick|DNA/MULE|DNA/Tc|Helitron", V11)
  ) # n=183
DNAother$group <- "DNA/Other"

# Group them together (n=3177 TEs). Keep a special copy of the same df for
# later.
erangedf_filt_merg2 <- rbind(
  gypsy, ltrother, line, hAT, tcMar, cmc, mule, mav, helitron, DNAother
)
erangedf_filt_merg2.1 <- erangedf_filt_merg2




########### Apply a cutoff for expression: RPKM >= 1 in more than 5 samples ###########

# Formula:
#   RPKM = numReads / (geneLength / 1000 * totalNumReads / 1,000,000)
#
#   numReads      - number of reads mapped to a gene sequence
#   geneLength    - length of the gene sequence
#   totalNumReads - total number of mapped reads of a sample

# Library size per sample = column sums of the merged count table (samples
# a..t only; columns u and v are left out of the a:t range).
# NOTE(review): this sums reads assigned to the loci in final1, not all
# mapped reads of the library -- confirm this is the intended denominator.
lib_size <- colSums(subset(final1, select = c(a:t)))
head(lib_size, n = 10)

# RPKM: reads per kilobase of locus per million reads of the sample.
erangedf_filt_merg <- erangedf_filt_merg2
sample_counts <- as.matrix(subset(erangedf_filt_merg, select = c(a:t)))
# Each COLUMN is scaled by its own library size. Dividing the data frame
# directly by lib_size recycles the vector down the rows of every column,
# which pairs counts with the wrong sample's library size; sweep() over
# MARGIN = 2 applies one divisor per column.
per_million <- sweep(
  sample_counts, 2, lib_size[colnames(sample_counts)] / 1000000, "/"
)
# Each ROW is then scaled by its locus length in kb (a vector of length
# nrow recycles down the columns, i.e. one divisor per row).
erangedf_filt_merg_RPKM <- as.data.frame(
  per_million / (erangedf_filt_merg$length / 1000)
)
erangedf_filt_merg_RPKM <- cbind(
  erangedf_filt_merg_RPKM,
  subset(
    erangedf_filt_merg,
    select = c(Geneid, V11, V5, V6, V7, percdiv, length, group)
  )
)

# Expression cutoff: RPKM >= 1 in more than 5 samples.
expressed <- rowSums(
  subset(erangedf_filt_merg_RPKM, select = c(a:t)) >= 1
) > 5
erangedf_filt_merg_filt1 <- erangedf_filt_merg_RPKM[expressed, ]
# Rename the filtered data frame to fit the rest of the analysis.
erangedf_filt_merg2 <- erangedf_filt_merg_filt1

erangedf_filt_merg2$meanRPKM <- rowMeans(
  subset(erangedf_filt_merg2, select = c(a:t))
)
erangedf_filt_merg2$logmeanRPKM <- log2(erangedf_filt_merg2$meanRPKM)

# Mean RPKM per TE group (groups ordered by decreasing mean RPKM).
ggplot(
  erangedf_filt_merg2,
  aes(x = reorder(group, -meanRPKM), y = meanRPKM)
) +
  geom_boxplot(outlier.size = 0.1) +
  geom_jitter(colour = "black", size = 0.1, alpha = 0.5) +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 10, angle = 90),
    axis.text.y = element_text(size = 4)
  )

# log2(mean RPKM) per TE group (FIGURE S2B): jittered points, transparent
# boxplots on top, and the group median marked in red.
ggplot(
  erangedf_filt_merg2,
  aes(x = reorder(group, -logmeanRPKM), y = logmeanRPKM)
) +
  geom_jitter(colour = "black", size = 0.2, alpha = 0.5, width = 0.3) +
  geom_boxplot(width = 0.7, alpha = 0) +
  stat_summary(fun = median, geom = "point", size = 2, colour = "red") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 10, angle = 90),
    axis.text.y = element_text(size = 4)
  )

# Pairwise Wilcoxon rank-sum tests of log2(mean RPKM) between TE groups,
# Benjamini-Hochberg adjusted.
pairwise.wilcox.test(erangedf_filt_merg2$logmeanRPKM, erangedf_filt_merg2$group,
                     p.adjust.method = "BH")
# Output recorded by the author (kept as a comment so the script parses):
#              DNA/CMC DNA/hAT DNA/Maverick DNA/MULE DNA/Other DNA/TcMar Helitron LINE    LTR/Gypsy
# DNA/hAT      0.00910 -       -            -        -         -         -        -       -        
# DNA/Maverick 0.29738 0.00027 -            -        -         -         -        -       -        
# DNA/MULE     0.00057 0.18831 1.9e-06      -        -         -         -        -       -        
# DNA/Other    0.60803 0.03094 0.07354      0.00070  -         -         -        -       -        
# DNA/TcMar    0.17040 0.16685 0.00721      0.00503  0.44524   -         -        -       -        
# Helitron     0.27509 0.00014 0.84501      6.9e-07  0.10243   0.01569   -        -       -        
# LINE         0.18831 0.22087 0.01358      0.01358  0.48107   0.97299   0.02078  -       -        
# LTR/Gypsy    0.01894 0.78237 0.00132      0.44524  0.05316   0.22087   0.00083  0.26131 -        
# LTR/Other    0.22087 1.3e-05 0.98704      1.7e-07  0.05316   0.00280   0.90186  0.00721 0.00033 

# Dunn's test on the same grouping, Benjamini-Hochberg adjusted.
library(FSA)
dunnTest(logmeanRPKM ~ group,
         data=erangedf_filt_merg2,
         method="bh")
# Output recorded by the author (kept as a comment so the script parses):
#             Comparison           Z      P.unadj        P.adj
# 1         DNA/CMC - DNA/hAT  2.91855951 3.516527e-03 1.054958e-02
# 2    DNA/CMC - DNA/Maverick -1.34619400 1.782400e-01 2.291657e-01
# 3    DNA/hAT - DNA/Maverick -4.14661334 3.374292e-05 2.169188e-04
# 4        DNA/CMC - DNA/MULE  4.04399191 5.254873e-05 2.955866e-04
# 5        DNA/hAT - DNA/MULE  1.64938882 9.906800e-02 1.651133e-01
# 6   DNA/Maverick - DNA/MULE  5.11778758 3.091405e-07 4.637107e-06
# 7       DNA/CMC - DNA/Other  0.62298112 5.332969e-01 5.999590e-01
# 8       DNA/hAT - DNA/Other -2.40317133 1.625357e-02 3.324594e-02
# 9  DNA/Maverick - DNA/Other  1.97607764 4.814598e-02 8.666277e-02
# 10     DNA/MULE - DNA/Other -3.63337034 2.797430e-04 1.144403e-03
# 11      DNA/CMC - DNA/TcMar  1.51320941 1.302265e-01 2.020756e-01
# 12      DNA/hAT - DNA/TcMar -1.56834041 1.168017e-01 1.877170e-01
# 13 DNA/Maverick - DNA/TcMar  2.84274727 4.472652e-03 1.257933e-02
# 14     DNA/MULE - DNA/TcMar -2.95621702 3.114379e-03 1.001050e-02
# 15    DNA/Other - DNA/TcMar  0.91339819 3.610332e-01 4.275393e-01
# 16       DNA/CMC - Helitron -1.23493458 2.168549e-01 2.637424e-01
# 17       DNA/hAT - Helitron -4.18793171 2.815082e-05 2.111311e-04
# 18  DNA/Maverick - Helitron  0.15452965 8.771921e-01 9.398487e-01
# 19      DNA/MULE - Helitron -5.19820340 2.012239e-07 4.527537e-06
# 20     DNA/Other - Helitron -1.89101218 5.862272e-02 1.014624e-01
# 21     DNA/TcMar - Helitron -2.80275282 5.066849e-03 1.266712e-02
# 22           DNA/CMC - LINE  1.48385525 1.378473e-01 2.067710e-01
# 23           DNA/hAT - LINE -1.48259319 1.381825e-01 2.005875e-01
# 24      DNA/Maverick - LINE  2.79012954 5.268696e-03 1.247849e-02
# 25          DNA/MULE - LINE -2.82725070 4.694956e-03 1.242782e-02
# 26         DNA/Other - LINE  0.89782911 3.692767e-01 4.260885e-01
# 27         DNA/TcMar - LINE  0.01471672 9.882582e-01 1.000000e+00
# 28          Helitron - LINE  2.74516559 6.048037e-03 1.360808e-02
# 29      DNA/CMC - LTR/Gypsy  2.71854972 6.556880e-03 1.405046e-02
# 30      DNA/hAT - LTR/Gypsy  0.12490765 9.005967e-01 9.424849e-01
# 31 DNA/Maverick - LTR/Gypsy  3.89193753 9.944685e-05 4.475108e-04
# 32     DNA/MULE - LTR/Gypsy -1.29608553 1.949460e-01 2.436825e-01
# 33    DNA/Other - LTR/Gypsy  2.22575409 2.603066e-02 5.092955e-02
# 34    DNA/TcMar - LTR/Gypsy  1.46514487 1.428814e-01 2.009270e-01
# 35     Helitron - LTR/Gypsy  3.89835651 9.684776e-05 4.842388e-04
# 36         LINE - LTR/Gypsy  1.40211720 1.608803e-01 2.193822e-01
# 37      DNA/CMC - LTR/Other -1.36440345 1.724406e-01 2.282302e-01
# 38      DNA/hAT - LTR/Other -4.88900110 1.013489e-06 1.140175e-05
# 39 DNA/Maverick - LTR/Other  0.17980510 8.573056e-01 9.409451e-01
# 40     DNA/MULE - LTR/Other -5.97336879 2.324037e-09 1.045817e-07
# 41    DNA/Other - LTR/Other -2.12259583 3.378773e-02 6.335200e-02
# 42    DNA/TcMar - LTR/Other -3.20476239 1.351740e-03 5.069026e-03
# 43     Helitron - LTR/Other  0.01111167 9.911344e-01 9.911344e-01
# 44         LINE - LTR/Other -3.11474390 1.841047e-03 6.372854e-03
# 45    LTR/Gypsy - LTR/Other -4.41027145 1.032411e-05 9.291701e-05

# Set rownames
rownames(erangedf_filt_merg2) <- erangedf_filt_merg2$Geneid
#erangedf_filt_merg2 <- erangedf_filt_merg2[, -1]

# Count matrix for DESeq2. DESeq2 models un-normalised read counts, so the
# RAW counts of the loci that passed the RPKM filter are pulled back from
# the copy saved before normalisation (erangedf_filt_merg2.1) instead of
# passing rounded RPKM values.
raw_rows <- match(erangedf_filt_merg2$Geneid, erangedf_filt_merg2.1$Geneid)
stopifnot(!anyNA(raw_rows))
erangedf_filt_merg3 <- subset(
  erangedf_filt_merg2.1[raw_rows, ],
  select = c(a:t)
)
rownames(erangedf_filt_merg3) <- erangedf_filt_merg2$Geneid

# Sample sheet.
# NOTE(review): rows are assumed to be in the same order as the count
# columns (a-d, i-l, e-h, m-t) -- confirm against sampleinfo_TE.tsv.
colData <- read.table("sampleinfo_TE.tsv", header = TRUE, check.names = FALSE)

# round() guards against fractional counts, which DESeq2 rejects.
ddsFullCountTable <- DESeqDataSetFromMatrix(
  countData = round(erangedf_filt_merg3),
  colData = colData,
  design = ~ condition)

ddsFullCountTable
dds <- DESeq(ddsFullCountTable)

res <- results(dds, alpha = 0.05)
res
summary(res)

# PCA plot on variance-stabilised data
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup = c("condition", "replicate"))

# Each condition versus the untreated control. alpha matches the 0.05
# adjusted-p cutoff applied below.
rescol1 <- results(dds, contrast = c("condition", "mock24", "untreated"), alpha = 0.05)
resdf <- data.frame(rescol1)
rescol2 <- results(dds, contrast = c("condition", "exudates24", "untreated"), alpha = 0.05)
resdf2 <- data.frame(rescol2)
rescol3 <- results(dds, contrast = c("condition", "mock48", "untreated"), alpha = 0.05)
resdf3 <- data.frame(rescol3)
rescol4 <- results(dds, contrast = c("condition", "exudates48", "untreated"), alpha = 0.05)
resdf4 <- data.frame(rescol4)

# One row per locus with log2 fold change and adjusted p-value of the four
# contrasts.
hmdf <- data.frame(rownames(resdf), resdf$log2FoldChange, resdf$padj, resdf2$log2FoldChange, resdf2$padj, resdf3$log2FoldChange, resdf3$padj, resdf4$log2FoldChange, resdf4$padj)

# Keep loci with |log2FC| > 1 AND padj < 0.05 in all four contrasts.
# Each column is compared on its own: `a & b & c & d` on numeric columns
# coerces them to logical first, so `abs(a & b & c & d) > 1` is never TRUE
# and `p1 & p2 & p3 & p4 < 0.05` only tests the last column. Both criteria
# are also combined into one mask, so the second filter no longer discards
# the first.
# To keep loci passing in ANY contrast, replace `== length(...)` by `> 0`.
lfc_cols <- c("resdf.log2FoldChange", "resdf2.log2FoldChange",
              "resdf3.log2FoldChange", "resdf4.log2FoldChange")
padj_cols <- c("resdf.padj", "resdf2.padj", "resdf3.padj", "resdf4.padj")
strong_lfc <- rowSums(abs(hmdf[, lfc_cols]) > 1) == length(lfc_cols)
significant <- rowSums(hmdf[, padj_cols] < 0.05) == length(padj_cols)
# which() drops loci whose statistics are NA in any contrast.
hmdf_sig <- hmdf[which(strong_lfc & significant), ]
hmdf_sig <- hmdf_sig %>% drop_na()
hmdf_sig_final <- subset(hmdf_sig, select = c(rownames.resdf., resdf.log2FoldChange, resdf2.log2FoldChange, resdf3.log2FoldChange, resdf4.log2FoldChange))
hmdf_sig_final <- data.frame(hmdf_sig_final, row.names = 1)
# The author recorded 114 significantly DE TE loci (100bp long, >5 norm reads
# in at least 3 samples, >1 log2FC) with the previous filter expressions;
# re-count with nrow(hmdf_sig_final).



################################################################################################
# Plot the divergence VS length for all expressed NON-GENIC TEs (Figure S2D)
################################################################################################
# NOTE(review): nongenicExprTE is not created in this script; it is expected
# to exist in the session (columns used here: group, percdiv, length,
# meanRPKM).
# Define color for each group
unique(nongenicExprTE[c("group")]) # 10 groups: 10 colours
library(RColorBrewer)

p <- ggplot(nongenicExprTE, aes(percdiv, log10(length), color = group)) +
  geom_point(shape = 16, size = 2, alpha = 0.8)
p + xlab("% divergence") +
  ylab("log10(TE length(bp))") +
  xlim(0, 45) +
  ylim(2, 4) +
  theme_classic() +
  scale_colour_manual(values = rainbow(10))

# Define groups of expression level: custom pch sizes
# Ranges recorded by the author:
#   logmeanRPKM range is -0.96 to 13.8
#   meanRPKM 0.51 to 14277
# Bins: [0,4], (4,10], (10,25], (25,50], (50,Inf). cut() leaves no gaps
# between consecutive bins (the earlier between() ranges returned NA for
# values such as 4.0005) and has no upper cap.
nongenicExprTE2 <- nongenicExprTE %>%
  mutate(exprgroup = as.character(cut(
    meanRPKM,
    breaks = c(0, 4, 10, 25, 50, Inf),
    labels = c("1", "2", "3", "4", "5"),
    include.lowest = TRUE
  )))

# Point size follows log2(meanRPKM); exprgroup is not used by this plot.
p <- ggplot(nongenicExprTE2, aes(percdiv, log10(length))) +
  geom_point(aes(colour = group, size = log2(meanRPKM)), shape = 16, alpha = 0.6) +
  xlab("% divergence") +
  ylab("log10(TE length(bp))") +
  xlim(0, 45) +
  ylim(2, 4) +
  theme_classic() +
  scale_colour_manual(values = rainbow(10))
p

# There are 2030 non-genic expressed TEs of >100bp and >1RPKM in at least 10 samples


################################################################################################
# plot the top TE/repeats categories expressed (FIGURE S2C)
################################################################################################

# Re-assign each non-genic expressed TE to a category from its V11
# annotation. The existing group column is blanked first.
nongenicExprTE$group <- NA

# LTR elements: Gypsy versus everything else.
LTR <- nongenicExprTE %>% dplyr::filter(grepl("LTR", V11))
gypsy <- LTR %>% dplyr::filter(grepl("LTR/Gypsy", V11))
gypsy$group <- "LTR/Gypsy"
ltrother <- LTR %>% dplyr::filter(!grepl("LTR/Gypsy", V11))
ltrother$group <- "LTR/Other"

line <- nongenicExprTE %>% dplyr::filter(grepl("LINE/", V11))
line$group <- "LINE"

# DNA transposon superfamilies.
hAT <- nongenicExprTE %>% dplyr::filter(grepl("DNA/hAT", V11))
hAT$group <- "DNA/hAT"
tcMar <- nongenicExprTE %>% dplyr::filter(grepl("DNA/Tc", V11))
tcMar$group <- "DNA/TcMar"
cmc <- nongenicExprTE %>% dplyr::filter(grepl("DNA/CMC", V11))
cmc$group <- "DNA/CMC"
mule <- nongenicExprTE %>% dplyr::filter(grepl("DNA/MULE", V11))
mule$group <- "DNA/MULE"
mav <- nongenicExprTE %>% dplyr::filter(grepl("DNA/Maverick", V11))
mav$group <- "DNA/Maverick"
helitron <- nongenicExprTE %>% dplyr::filter(grepl("Helitron", V11))
helitron$group <- "Helitron"

# Remaining DNA elements: any "DNA/" entry that matches none of the
# superfamilies labelled above.
DNA <- nongenicExprTE %>% dplyr::filter(grepl("DNA/", V11))
DNAother <- DNA %>%
  dplyr::filter(
    !grepl("DNA/CMC|DNA/hAT|DNA/Maverick|DNA/MULE|DNA/Tc|Helitron", V11)
  )
DNAother$group <- "DNA/Other"

# Group all groups together
nongenicExprTE_mod <- rbind(
  gypsy, ltrother, line, hAT, tcMar, cmc, mule, mav, helitron, DNAother
)

# Write the names of the TE classes that are EXPRESSED at all, with the
# number of expressed loci per V11 class, most frequent class first.
# seq_len() stays valid (empty) when the table has no rows.
nongenicExprTE_mod$newnumber <- seq_len(nrow(nongenicExprTE_mod))
ZDT <- data.table(nongenicExprTE_mod)
Zcat <- ZDT[, .(number_of_distinct_orders = length(unique(newnumber))), by = V11]
Zcatord <- Zcat %>% arrange(desc(number_of_distinct_orders))
# NOTE(review): out_dir is not defined in this script; it must exist in the
# session before this line runs.
write.table(
  Zcatord,
  file.path(out_dir, "expr_TE_categories_rework1RPKM.txt"),
  col.names = TRUE, quote = FALSE, sep = "\t", row.names = FALSE
)

# Category table read back from disk.
# NOTE(review): the ".mod1" file presumably is a hand-edited copy of the
# table written above with an added numberingenome column -- confirm.
TEcat <- read.table(
  "expr_TE_categories_rework1RPKM.mod1.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  quote = ""
)

# Normalize the numbers over the total number of TEs for each category:
TEcat$prop <- (TEcat$number_of_distinct_orders / TEcat$numberingenome) * 100

# Horizontal bar chart of expressed loci per class.
# The trailing theme_classic() is a complete theme, so it replaces the
# theme_minimal() and axis.text.x settings added before it.
ggplot(
  TEcat,
  aes(
    x = reorder(V11, -number_of_distinct_orders),
    y = number_of_distinct_orders
  )
) +
  geom_col(width = 0.8) +
  theme_minimal() +
  scale_x_discrete(limits = rev) +
  theme(axis.text.x = element_text(size = 10, angle = 90)) +
  coord_flip() +
  labs(x = "", y = " Top expressed TE/repeat classes                  ") +
  ylim(0, 400) +
  theme_classic()

# Calculate the % divergence for non-genic expressed TEs in bins of 10%
# divergence (0-10, 10-20, 20-30, 30-40, 40-50).
# Extract percdiv for each group
erangedf_filt_merg3 <- subset(nongenicExprTE_mod, select = c(percdiv, group))
# Make bins. include.lowest = TRUE puts percdiv == 0 into the first bin
# (labelled "[0,10]"); with the default open lower bound those loci became
# NA and were left out of every count.
erangedf_filt_merg3$bins <- cut(
  erangedf_filt_merg3$percdiv,
  breaks = c(0, 10, 20, 30, 40, 50),
  include.lowest = TRUE
)

# Bin counts per TE category. The "recorded" tables below are the author's
# original console output (bins (0,10] (10,20] (20,30] (30,40] (40,50]),
# obtained before percdiv == 0 was included in the first bin.
divgypsy <- dplyr::filter(erangedf_filt_merg3, grepl("LTR/Gypsy", group))
unique(divgypsy[c("bins")])
table(divgypsy$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           19      99      67      11       0

divcmc <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/CMC", group))
unique(divcmc[c("bins")])
table(divcmc$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           18      14      69       8       0

divhat <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/hAT", group))
unique(divhat[c("bins")])
table(divhat$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           108     107     130      31       0

divmule <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/MULE", group))
unique(divmule[c("bins")])
table(divmule$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           75     130      61      20       0

divdnaother <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/Other", group))
unique(divdnaother[c("bins")])
table(divdnaother$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           25      33      98      30       0

divmaverick <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/Maverick", group))
unique(divmaverick[c("bins")])
table(divmaverick$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           10      21      64      10       0

divtcmar <- dplyr::filter(erangedf_filt_merg3, grepl("DNA/TcMar", group))
unique(divtcmar[c("bins")])
table(divtcmar$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           41     110      96      24       0

divhelitron <- dplyr::filter(erangedf_filt_merg3, grepl("Helitron", group))
unique(divhelitron[c("bins")])
table(divhelitron$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           4      14      55      49       0

divline <- dplyr::filter(erangedf_filt_merg3, grepl("LINE", group))
unique(divline[c("bins")])
table(divline$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           36      36     107      23       1

divltrother <- dplyr::filter(erangedf_filt_merg3, grepl("LTR/Other", group))
unique(divltrother[c("bins")])
table(divltrother$bins)
# recorded: (0,10] (10,20] (20,30] (30,40] (40,50]
#           28      27      78       9       1

# STACKED PLOT, FIGURE S2C
# NOTE(review): the input presumably is a hand-made long table with columns
# V11, bin and number built from the bin counts above -- confirm.
summary <- read.table(
  "expr_TE_categories_RPKM1_mod4.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  quote = ""
)

# Side-by-side bars per divergence bin.
ggplot(summary, aes(x = reorder(V11, -number), y = number, fill = bin)) +
  geom_col(position = "dodge")

# Stacked horizontal bars per divergence bin.
ggplot(summary, aes(x = reorder(V11, -number), y = number, fill = bin)) +
  geom_col(position = "stack") +
  theme_classic() +
  coord_flip() +
  scale_x_discrete(limits = rev)

# For % in the plot:
summaryperc <- read.table(
  "expr_TE_categories_rework1RPKM.mod1.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE,
  quote = ""
)
# These numbers went in Figure 1B
summaryperc$prop <- summaryperc$number_of_distinct_orders /
  summaryperc$numberingenome * 100


# Plot the lengths of genic and non-genic expressed TEs
# Non-genic = 2030 expressed
# Genic = 232
# 12.3 % of expressed TEs are found in genes (>100bp, >1RPKM)

# NOTE(review): genicExprTE is not created in this script; it is expected to
# exist in the session.
# Size of genic TEs (only expressed ones).
# geom_bar() counts identical log2(length) values and has no binwidth
# parameter, so the ignored `binwidth = 25` argument was dropped.
# NOTE(review): the title says 809 while the comments here say 232 genic
# expressed TEs -- confirm which number is current.
ggplot(genicExprTE, aes(x = log2(length))) +
  geom_bar(color = "black", fill = "white") +
  theme_classic() +
  xlab("Length of Expressed Genic TEs") +
  ylab("Number of Expressed Genic TEs") +
  ggtitle("Length of 809 genic expressed TEs") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(6.5, 15) +
  ylim(0, 25)
# Density of log2(length). The fill is set as a constant on the layer;
# inside aes() the string "white" is treated as a data value, which draws a
# default-coloured fill and adds a legend entry called "white".
xdensity <- ggplot(genicExprTE, aes(x = log2(length))) +
  geom_density(fill = "white", alpha = 0.5) +
  theme_classic() +
  ylim(0, 0.8) +
  xlim(6.5, 15)
xdensity
#232 genic TEs expressed

# Size of genic TEs (including non-expressed).
# NOTE(review): genicTEs and out_dir are not created in this script; they
# are expected to exist in the session.
# Rows whose V14 annotation matches a TE-related gene product
# (case-insensitive) are removed first.
filt.genicTEs <- dplyr::filter(genicTEs, !grepl("reverse_transcriptase|CENP-b|crinkler_family_protein|zinc_finger_mym|DNA_polymerase_delta_subunit_1|DNA_polymerase_epsilon|retrotransposon|integrase|pif1|zinc_finger_bed_domain|PIF1|ricesleeper|helicase-primase|helicase/primase|gag-pol|far1-related|ribonuclease_hi|ribonuclease_h|ATP_dependent_DNA_helicase|jockey|rve_super_family_integrase|transposase|transposable", V14, ignore.case = TRUE))
# Histogram of V15 in 25-unit bins, x axis limited to 0-10000.
ggplot(filt.genicTEs, aes(x = V15)) +
  geom_histogram(color = "black", fill = "white", binwidth = 25) +
  theme_classic() +
  xlab("Length of Expressed Genic TEs") +
  ylab("Number of Expressed Genic TEs") +
  ggtitle("786 expressed TEs are found in 
          Expressed Non-TE genes") +
  xlim(0, 10000) +
  theme(plot.title = element_text(hjust = 0.5))

# Export the genic expressed TEs; TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
write.table(
  genicExprTE,
  file.path(out_dir, "GenicexpressedTEs_rework1RPKM.txt"),
  col.names = TRUE, quote = FALSE, sep = "\t", row.names = FALSE
)
# Read the non-genic table from the working directory (this file is not
# written by this script).
x <- read.table("nonGenicexpressedTEs_rework1RPKM.txt", header = TRUE, sep = "\t", stringsAsFactors = FALSE, quote = "")


# Length distribution of non-genic expressed TEs: one bar per distinct
# log2(length) value.
ggplot(nongenicExprTE, aes(x = log2(length))) +
  geom_bar(color = "red3", fill = "red") +
  theme_classic() +
  xlab("Length of Expressed non-genic TEs") +
  ylab("Number of Expressed non-genic TEs") +
  ggtitle("log(length of non-genic expressed TEs)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(6.5, 15) +
  ylim(0, 25)
# Density of log2(length). The fill is set as a constant on the layer;
# inside aes() the string "white" is treated as a data value, which draws a
# default-coloured fill and adds a legend entry called "white".
xdensity <- ggplot(nongenicExprTE, aes(x = log2(length))) +
  geom_density(fill = "white", alpha = 0.5) +
  theme_classic() +
  ylim(0, 0.8) +
  xlim(6.5, 15)
xdensity
# 2030 nongenicExprTE


# STATS on the genic VS non-genic length distributions.
# group "1" = non-genic expressed TEs, group "2" = genic expressed TEs.
df1 <- subset(nongenicExprTE, select = c(length))
df2 <- subset(genicExprTE, select = c(length))
df1$group <- "1"
df2$group <- "2"
stats <- rbind(df1, df2)

wilcox.test(length ~ group, data = stats)
# recorded result: p-value < 2.2e-16
kruskal.test(length ~ group, data = stats)
# recorded result: p-value < 2.2e-16

# GO-term analysis on expressed genes containing TE fragments.
# From the genic expressed TE list: get just the protein ID, i.e. everything
# after ";protein_id=" in the V14 annotation string.
v14 <- as.character(genicExprTE$V14)
id_match <- regexpr("(?<=;protein_id=).*", v14, perl = TRUE)
has_id <- !is.na(id_match) & id_match > -1L
# Loci without a protein_id tag are skipped; previously they ended up in the
# output file as the literal text "character(0)".
protein_ids <- unique(regmatches(v14, id_match))
stopifnot(length(protein_ids) <= sum(has_id))
# One-column character matrix named proteinID, as before.
dfz <- matrix(protein_ids, ncol = 1, dimnames = list(NULL, "proteinID"))

# NOTE(review): out_dir is not defined in this script; it must exist in the
# session before this line runs.
write.table(
  dfz,
  file.path(out_dir, "proteinIDs_ofgeneswithExprTEs.tsv"),
  row.names = FALSE
)
#Copy these IDs in the Uniprot tool, searching for EMBL/GenBank/DDBJ CDS identifiers. Then use the token gp__UBFA_DJiH_SV8 or https://biit.cs.ut.ee/gplink/l/uTy031lKRI to to a GO term analysis.


