Load Libraries

library("readxl")
library("tidyverse") 
library("edgeR")        
library("clusterProfiler")
library("org.Hs.eg.db") 
library("enrichplot")   
library("msigdbr")
library("biomaRt")
library(grid)
library(scales)
library(ggrepel)
library(gtsummary)
library(gt)
library(MetBrewer)
library(eulerr)
library(ggthemes)

Differential Expression Analysis

Define Functions

# Default absolute logFC cutoff shared by the differential-expression helpers.
fc_threshold <- 1

# Summarise one differential-expression test.
#
# Arguments:
#   contrast       Label for the comparison; stored unchanged in the result.
#   qlf            Test result object passed to decideTests() and topTags().
#   p_value        Significance cutoff. Passed to decideTests() and also used
#                  as the FDR cutoff for `thresholded_results`, so both views
#                  of the data agree on what "significant" means.
#   n_top          Maximum number of rows requested from topTags().
#   lfc_threshold  Minimum absolute logFC for `thresholded_results`;
#                  defaults to the file-level `fc_threshold`.
#
# Returns a list with the inputs (`contrast`, `qlf`), the decideTests() calls
# (`is_de`) and their summary (`summary_de`), the topTags() result
# (`top_tags`), the counts of genes coded 1 / -1 / 0 by decideTests()
# (`upregulated`, `downregulated`, `no_change`), and the rows of the topTags
# table passing both the FDR and the |logFC| cutoff (`thresholded_results`).
get_results <- function(contrast, qlf, p_value = 0.05, n_top = Inf,
                        lfc_threshold = fc_threshold) {
  # identify significant DE genes
  is.de <- decideTests(qlf, p.value = p_value)
  summary_de <- summary(is.de)
  top_tags <- topTags(qlf, n = n_top)

  # Apply the FDR and fold-change cutoffs. NA comparisons are dropped
  # explicitly: logical subsetting with NA would otherwise add all-NA rows.
  tags <- top_tags$table
  keep <- !is.na(tags$FDR) & !is.na(tags$logFC) &
    tags$FDR <= p_value & abs(tags$logFC) >= lfc_threshold

  list(
    contrast = contrast,
    qlf = qlf,
    is_de = is.de,
    summary_de = summary_de,
    top_tags = top_tags,
    upregulated = sum(is.de == 1),
    downregulated = sum(is.de == -1),
    no_change = sum(is.de == 0),
    thresholded_results = tags[keep, , drop = FALSE]
  )
}

# Bar chart of up-regulated / down-regulated / unchanged gene counts, with one
# group of bars per comparison.
#
# `results_list` is a list whose elements each provide `contrast`,
# `upregulated`, `downregulated` and `no_change` (the fields produced by
# get_results()). The chart is printed as a side effect.
plot_all_results <- function(results_list) {
  # Build one three-row table per comparison, then stack them.
  per_comparison <- lapply(results_list, function(res) {
    data.frame(
      Category = c("Upregulated", "Downregulated", "No Change"),
      Count = c(res$upregulated, res$downregulated, res$no_change),
      Comparison = res$contrast
    )
  })
  counts_df <- do.call(rbind, per_comparison)

  fill_colours <- c(
    "Upregulated" = "green",
    "Downregulated" = "red",
    "No Change" = "gray"
  )

  bar_chart <- ggplot(
    counts_df,
    aes(x = Comparison, y = Count, fill = Category)
  ) +
    geom_bar(stat = "identity", position = "dodge") +
    # Print each count just above its bar.
    geom_text(
      aes(label = Count),
      position = position_dodge(width = 0.9),
      vjust = -0.5
    ) +
    labs(
      title = "Differential Expression Across Comparisons",
      x = "Comparison",
      y = "Number of Genes"
    ) +
    theme_minimal() +
    scale_fill_manual(values = fill_colours)

  print(bar_chart)
}

# Bar chart of genes passing both the FDR and the |logFC| cutoff, with one
# group of bars per comparison.
#
# Arguments:
#   results_list   List whose elements each provide `contrast` and
#                  `thresholded_results` (a table with `logFC` and `FDR`
#                  columns), as produced by get_results().
#   fdr_threshold  Maximum FDR for a gene to be counted.
#   lfc_threshold  Minimum absolute logFC for a gene to be counted; defaults
#                  to the file-level `fc_threshold`.
#
# Prints the chart and returns the data frame of counts behind it (columns
# `Category`, `Count`, `Comparison`).
plot_thresholded_results <- function(results_list, fdr_threshold = 0.05,
                                     lfc_threshold = fc_threshold) {
  plot_data <- lapply(results_list, function(result) {
    thresholded_results <- result$thresholded_results
    passes_fdr <- thresholded_results$FDR <= fdr_threshold

    # sum(..., na.rm = TRUE) counts only rows where the test is TRUE; counting
    # nrow() of a logical subset would also count rows whose test is NA.
    num_upregulated <- sum(
      passes_fdr & thresholded_results$logFC >= lfc_threshold,
      na.rm = TRUE
    )
    num_downregulated <- sum(
      passes_fdr & thresholded_results$logFC <= -lfc_threshold,
      na.rm = TRUE
    )

    data.frame(
      Category = c("Upregulated", "Downregulated", "ALL"),
      Count = c(
        num_upregulated,
        num_downregulated,
        num_upregulated + num_downregulated
      ),
      Comparison = result$contrast
    )
  })
  plot_data <- do.call(rbind, plot_data)

  # Build the title from the cutoffs actually applied so the label cannot
  # drift away from the filter.
  plot_title <- sprintf(
    "Differential Expression Across Comparisons (FDR <= %s and |logFC| >= %s)",
    format(fdr_threshold),
    format(lfc_threshold)
  )

  plot <- ggplot(plot_data, aes(x = Comparison, y = Count, fill = Category)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(
      aes(label = Count),
      position = position_dodge(width = 0.9),
      vjust = -0.5
    ) + # Adds numbers on top of bars
    labs(
      title = plot_title,
      x = "Comparison",
      y = "Number of Genes"
    ) +
    theme_minimal() +
    scale_fill_manual(
      values = c("Upregulated" = "green", "Downregulated" = "red", "ALL" = "gray")
    )

  print(plot)
  plot_data
}

# Centred rolling mean with windows truncated at the ends of the series.
#
# Arguments:
#   x  Vector of values to smooth.
#   k  Window size. Each position i is averaged over the elements from
#      i - k %/% 2 to i + k %/% 2, clipped to the bounds of `x`, so a full
#      window holds 2 * (k %/% 2) + 1 values.
#
# Returns a numeric vector the same length as `x`. NA values are ignored
# within each window; a window containing only NA yields NaN.
#
# The window bounds are clamped to 1..length(x), so inputs shorter than the
# window (including empty input) and odd `k` are handled.
custom_rollmean <- function(x, k = 40) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 0)

  n <- length(x)
  # Integer half-width: an odd k must not produce fractional indices.
  half_window <- k %/% 2

  vapply(
    seq_len(n),
    function(i) {
      window_start <- max(1, i - half_window)
      window_end <- min(n, i + half_window)
      mean(x[window_start:window_end], na.rm = TRUE)
    },
    numeric(1)
  )
}
# Look up the protein-coding genes located in two regions of chromosome 8
# (a deletion and a duplication) via biomaRt, report how many there are in
# each, and stack both sets into one labelled table (`combined_genes`).

# Handle for the human gene dataset; reused by both getBM() queries below.
# NOTE(review): no Ensembl release is pinned here, so the coordinates below
# are interpreted against whatever release/assembly the service serves by
# default -- confirm this matches the assembly the breakpoints were called on.
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")

# Region of interest: chromosome and the start/end coordinates of each event.
chromosome_of_interest <- "8"
deletion_start <- 0
deletion_end <- 7247573
duplication_start <- 11828865
duplication_end <- 40361770

# Columns requested for every gene returned by the queries.
attributes <- c(
  "ensembl_gene_id",
  "external_gene_name",
  "chromosome_name",
  "start_position",
  "end_position",
  "strand",
  "gene_biotype",
  "description"
)

# Deletion region: filter names and their values are matched by position.
filters_del <- c("chromosome_name", "start", "end")
values_del <- list(chromosome_of_interest, deletion_start, deletion_end)

genes_in_deletion <- getBM(
  attributes = attributes,
  filters = filters_del,
  values = values_del,
  mart = ensembl
)

# Keep protein-coding genes only and count distinct Ensembl gene IDs.
protein_coding_genes_del <- genes_in_deletion[genes_in_deletion$gene_biotype == "protein_coding", ]
number_of_genes_del <- length(unique(protein_coding_genes_del$ensembl_gene_id))
cat("Number of protein-coding genes in the deletion region:", number_of_genes_del, "\n")
# Output recorded from the rendered run:
## Number of protein-coding genes in the deletion region: 23
# Duplication region: same query with the duplication coordinates.
# NOTE(review): the recorded table lists genes that start before
# duplication_start (e.g. FDFT1 at 11795573), so the start/end filters appear
# to select genes overlapping the region rather than only genes fully inside
# it -- confirm this is the intended definition.
filters_dup <- c("chromosome_name", "start", "end")
values_dup <- list(chromosome_of_interest, duplication_start, duplication_end)

genes_in_duplication <- getBM(
  attributes = attributes,
  filters = filters_dup,
  values = values_dup,
  mart = ensembl
)

# Keep protein-coding genes only and count distinct Ensembl gene IDs.
protein_coding_genes_dup <- genes_in_duplication[genes_in_duplication$gene_biotype == "protein_coding", ]
number_of_genes_dup <- length(unique(protein_coding_genes_dup$ensembl_gene_id))
cat("Number of protein-coding genes in the duplication region:", number_of_genes_dup, "\n")
# Output recorded from the rendered run:
## Number of protein-coding genes in the duplication region: 172
# Tag each table with its region of origin, then stack them (deletion rows
# first) and print the combined table.
protein_coding_genes_del$region <- "Deletion"
protein_coding_genes_dup$region <- "Duplication"
combined_genes <- rbind(protein_coding_genes_del, protein_coding_genes_dup)
combined_genes
ensembl_gene_id external_gene_name chromosome_name start_position end_position strand gene_biotype description region
6 ENSG00000176269 OR4F21 8 166086 167024 -1 protein_coding olfactory receptor family 4 subfamily F member 21 [Source:HGNC Symbol;Acc:HGNC:19583] Deletion
10 ENSG00000172748 ZNF596 8 232137 264703 1 protein_coding zinc finger protein 596 [Source:HGNC Symbol;Acc:HGNC:27268] Deletion
21 ENSG00000147364 FBXO25 8 406428 477967 1 protein_coding F-box protein 25 [Source:HGNC Symbol;Acc:HGNC:13596] Deletion
24 ENSG00000180190 TDRP 8 489803 545781 -1 protein_coding testis development related protein [Source:HGNC Symbol;Acc:HGNC:26951] Deletion
26 ENSG00000104714 ERICH1 8 614746 738106 -1 protein_coding glutamate rich 1 [Source:HGNC Symbol;Acc:HGNC:27234] Deletion
31 ENSG00000198010 DLGAP2 8 737628 1708476 1 protein_coding DLG associated protein 2 [Source:HGNC Symbol;Acc:HGNC:2906] Deletion
48 ENSG00000182372 CLN8 8 1755778 1801711 1 protein_coding CLN8 transmembrane ER and ERGIC protein [Source:HGNC Symbol;Acc:HGNC:2079] Deletion
50 ENSG00000283239 KBTBD11-OT1 8 1763888 1958627 1 protein_coding KBTBD11 overlapping transcript 1 [Source:NCBI gene (formerly Entrezgene);Acc:104266957] Deletion
54 ENSG00000104728 ARHGEF10 8 1823926 1958641 1 protein_coding Rho guanine nucleotide exchange factor 10 [Source:HGNC Symbol;Acc:HGNC:14103] Deletion
60 ENSG00000176595 KBTBD11 8 1973677 2006936 1 protein_coding kelch repeat and BTB domain containing 11 [Source:HGNC Symbol;Acc:HGNC:29104] Deletion
64 ENSG00000036448 MYOM2 8 2045046 2165552 1 protein_coding myomesin 2 [Source:HGNC Symbol;Acc:HGNC:7614] Deletion
85 ENSG00000183117 CSMD1 8 2935353 4994972 -1 protein_coding CUB and Sushi multiple domains 1 [Source:HGNC Symbol;Acc:HGNC:14026] Deletion
121 ENSG00000147316 MCPH1 8 6406592 6648508 1 protein_coding microcephalin 1 [Source:HGNC Symbol;Acc:HGNC:6954] Deletion
123 ENSG00000091879 ANGPT2 8 6499632 6563409 -1 protein_coding angiopoietin 2 [Source:HGNC Symbol;Acc:HGNC:485] Deletion
130 ENSG00000155189 AGPAT5 8 6708642 6761503 1 protein_coding 1-acylglycerol-3-phosphate O-acyltransferase 5 [Source:HGNC Symbol;Acc:HGNC:20886] Deletion
137 ENSG00000275591 XKR5 8 6808517 6835524 -1 protein_coding XK related 5 [Source:HGNC Symbol;Acc:HGNC:20782] Deletion
141 ENSG00000164825 DEFB1 8 6870592 6877936 -1 protein_coding defensin beta 1 [Source:HGNC Symbol;Acc:HGNC:2766] Deletion
144 ENSG00000164822 DEFA6 8 6924697 6926076 -1 protein_coding defensin alpha 6 [Source:HGNC Symbol;Acc:HGNC:2765] Deletion
147 ENSG00000164821 DEFA4 8 6935820 6938306 -1 protein_coding defensin alpha 4 [Source:HGNC Symbol;Acc:HGNC:2763] Deletion
152 ENSG00000206047 DEFA1 8 6977649 6980092 -1 protein_coding defensin alpha 1 [Source:HGNC Symbol;Acc:HGNC:2761] Deletion
155 ENSG00000240247 DEFA1B 8 6996766 6999198 -1 protein_coding defensin alpha 1B [Source:HGNC Symbol;Acc:HGNC:33596] Deletion
158 ENSG00000239839 DEFA3 8 7015869 7018297 -1 protein_coding defensin alpha 3 [Source:HGNC Symbol;Acc:HGNC:2762] Deletion
161 ENSG00000164816 DEFA5 8 7055304 7056739 -1 protein_coding defensin alpha 5 [Source:HGNC Symbol;Acc:HGNC:2764] Deletion
1 ENSG00000079459 FDFT1 8 11795573 11839395 1 protein_coding farnesyl-diphosphate farnesyltransferase 1 [Source:HGNC Symbol;Acc:HGNC:3629] Duplication
2 ENSG00000164733 CTSB 8 11842524 11869533 -1 protein_coding cathepsin B [Source:HGNC Symbol;Acc:HGNC:2527] Duplication
101 ENSG00000205884 DEFB136 8 11973937 11974599 -1 protein_coding defensin beta 136 [Source:HGNC Symbol;Acc:HGNC:34433] Duplication
11 ENSG00000205883 DEFB135 8 11982256 11984590 1 protein_coding defensin beta 135 [Source:HGNC Symbol;Acc:HGNC:32400] Duplication
12 ENSG00000205882 DEFB134 8 11993174 12000752 -1 protein_coding defensin beta 134 [Source:HGNC Symbol;Acc:HGNC:32399] Duplication
20 ENSG00000233050 DEFB130B 8 12064389 12071747 -1 protein_coding defensin beta 130B [Source:HGNC Symbol;Acc:HGNC:39814] Duplication
22 ENSG00000215343 ZNF705D 8 12089338 12115516 1 protein_coding zinc finger protein 705D [Source:HGNC Symbol;Acc:HGNC:33202] Duplication
262 ENSG00000226430 USP17L7 8 12132417 12134099 -1 protein_coding ubiquitin specific peptidase 17 like family member 7 [Source:HGNC Symbol;Acc:HGNC:37180] Duplication
27 ENSG00000223443 USP17L2 8 12136435 12138849 -1 protein_coding ubiquitin specific peptidase 17 like family member 2 [Source:HGNC Symbol;Acc:HGNC:34434] Duplication
28 ENSG00000254866 DEFB109D 8 12150888 12158033 -1 protein_coding defensin beta 109D (pseudogene) [Source:HGNC Symbol;Acc:HGNC:30838] Duplication
311 ENSG00000186523 FAM86B1 8 12182096 12194133 -1 protein_coding family with sequence similarity 86 member B1 [Source:HGNC Symbol;Acc:HGNC:28268] Duplication
35 ENSG00000232948 DEFB130A 8 12310962 12318316 -1 protein_coding defensin beta 130A [Source:HGNC Symbol;Acc:HGNC:18107] Duplication
47 ENSG00000145002 FAM86B2 8 12424411 12436406 -1 protein_coding family with sequence similarity 86 member B2 [Source:HGNC Symbol;Acc:HGNC:32222] Duplication
65 ENSG00000154359 LONRF1 8 12721906 12756073 -1 protein_coding LON peptidase N-terminal domain and ring finger 1 [Source:HGNC Symbol;Acc:HGNC:26302] Duplication
73 ENSG00000250305 TRMT9B 8 12945642 13031503 1 protein_coding tRNA methyltransferase 9B (putative) [Source:HGNC Symbol;Acc:HGNC:26725] Duplication
78 ENSG00000164741 DLC1 8 13083361 13604610 -1 protein_coding DLC1 Rho GTPase activating protein [Source:HGNC Symbol;Acc:HGNC:2897] Duplication
84 ENSG00000164743 C8orf48 8 13566869 13568288 1 protein_coding chromosome 8 open reading frame 48 [Source:HGNC Symbol;Acc:HGNC:26345] Duplication
95 ENSG00000185053 SGCZ 8 14084845 15238431 -1 protein_coding sarcoglycan zeta [Source:HGNC Symbol;Acc:HGNC:14075] Duplication
110 ENSG00000104723 TUSC3 8 15417215 15766649 1 protein_coding tumor suppressor candidate 3 [Source:HGNC Symbol;Acc:HGNC:30242] Duplication
117 ENSG00000038945 MSR1 8 16107878 16567490 -1 protein_coding macrophage scavenger receptor 1 [Source:HGNC Symbol;Acc:HGNC:7376] Duplication
125 ENSG00000078579 FGF20 8 16992181 17002345 -1 protein_coding fibroblast growth factor 20 [Source:HGNC Symbol;Acc:HGNC:3677] Duplication
127 ENSG00000155970 MICU3 8 17027238 17125880 1 protein_coding mitochondrial calcium uptake family member 3 [Source:HGNC Symbol;Acc:HGNC:27820] Duplication
131 ENSG00000104219 ZDHHC2 8 17156482 17224799 1 protein_coding zinc finger DHHC-type palmitoyltransferase 2 [Source:HGNC Symbol;Acc:HGNC:18469] Duplication
132 ENSG00000198791 CNOT7 8 17224966 17246878 -1 protein_coding CCR4-NOT transcription complex subunit 7 [Source:HGNC Symbol;Acc:HGNC:14101] Duplication
134 ENSG00000155975 VPS37A 8 17246931 17302427 1 protein_coding VPS37A subunit of ESCRT-I [Source:HGNC Symbol;Acc:HGNC:24928] Duplication
136 ENSG00000003987 MTMR7 8 17296794 17413528 -1 protein_coding myotubularin related protein 7 [Source:HGNC Symbol;Acc:HGNC:7454] Duplication
1411 ENSG00000003989 SLC7A2 8 17497088 17570573 1 protein_coding solute carrier family 7 member 2 [Source:HGNC Symbol;Acc:HGNC:11060] Duplication
146 ENSG00000104213 PDGFRL 8 17576433 17644071 1 protein_coding platelet derived growth factor receptor like [Source:HGNC Symbol;Acc:HGNC:8805] Duplication
149 ENSG00000129422 MTUS1 8 17643795 17801094 -1 protein_coding microtubule associated scaffold protein 1 [Source:HGNC Symbol;Acc:HGNC:29789] Duplication
156 ENSG00000104760 FGL1 8 17864380 17910365 -1 protein_coding fibrinogen like 1 [Source:HGNC Symbol;Acc:HGNC:3695] Duplication
159 ENSG00000078674 PCM1 8 17922842 18029948 1 protein_coding pericentriolar material 1 [Source:HGNC Symbol;Acc:HGNC:8727] Duplication
1611 ENSG00000104763 ASAH1 8 18055992 18084998 -1 protein_coding N-acylsphingosine amidohydrolase 1 [Source:HGNC Symbol;Acc:HGNC:735] Duplication
164 ENSG00000171428 NAT1 8 18170477 18223689 1 protein_coding N-acetyltransferase 1 [Source:HGNC Symbol;Acc:HGNC:7645] Duplication
171 ENSG00000156006 NAT2 8 18391282 18401218 1 protein_coding N-acetyltransferase 2 [Source:HGNC Symbol;Acc:HGNC:7646] Duplication
172 ENSG00000156011 PSD3 8 18527303 19084730 -1 protein_coding pleckstrin and Sec7 domain containing 3 [Source:HGNC Symbol;Acc:HGNC:19093] Duplication
190 ENSG00000104611 SH2D4A 8 19313693 19396218 1 protein_coding SH2 domain containing 4A [Source:HGNC Symbol;Acc:HGNC:26102] Duplication
192 ENSG00000147408 CSGALNACT1 8 19404161 19758029 -1 protein_coding chondroitin sulfate N-acetylgalactosaminyltransferase 1 [Source:HGNC Symbol;Acc:HGNC:24290] Duplication
198 ENSG00000104613 INTS10 8 19817391 19852083 1 protein_coding integrator complex subunit 10 [Source:HGNC Symbol;Acc:HGNC:25548] Duplication
199 ENSG00000175445 LPL 8 19901717 19967259 1 protein_coding lipoprotein lipase [Source:HGNC Symbol;Acc:HGNC:6677] Duplication
204 ENSG00000036565 SLC18A1 8 20144855 20183206 -1 protein_coding solute carrier family 18 member A1 [Source:HGNC Symbol;Acc:HGNC:10934] Duplication
206 ENSG00000147416 ATP6V1B2 8 20197381 20230399 1 protein_coding ATPase H+ transporting V1 subunit B2 [Source:HGNC Symbol;Acc:HGNC:854] Duplication
209 ENSG00000061337 LZTS1 8 20246165 20303963 -1 protein_coding leucine zipper tumor suppressor 1 [Source:HGNC Symbol;Acc:HGNC:13861] Duplication
235 ENSG00000168546 GFRA2 8 21690398 21812357 -1 protein_coding GDNF family receptor alpha 2 [Source:HGNC Symbol;Acc:HGNC:4244] Duplication
237 ENSG00000147443 DOK2 8 21908873 21913690 -1 protein_coding docking protein 2 [Source:HGNC Symbol;Acc:HGNC:2991] Duplication
238 ENSG00000130227 XPO7 8 21919662 22006585 1 protein_coding exportin 7 [Source:HGNC Symbol;Acc:HGNC:14108] Duplication
241 ENSG00000158806 NPM2 8 22024125 22036897 1 protein_coding nucleophosmin/nucleoplasmin 2 [Source:HGNC Symbol;Acc:HGNC:7930] Duplication
242 ENSG00000158815 FGF17 8 22042398 22048809 1 protein_coding fibroblast growth factor 17 [Source:HGNC Symbol;Acc:HGNC:3673] Duplication
243 ENSG00000158856 DMTN 8 22048995 22082527 1 protein_coding dematin actin binding protein [Source:HGNC Symbol;Acc:HGNC:3382] Duplication
245 ENSG00000158863 FHIP2B 8 22089150 22104911 1 protein_coding FHF complex subunit HOOK interacting protein 2B [Source:HGNC Symbol;Acc:HGNC:16492] Duplication
246 ENSG00000275074 NUDT18 8 22105748 22109419 -1 protein_coding nudix hydrolase 18 [Source:HGNC Symbol;Acc:HGNC:26194] Duplication
247 ENSG00000168453 HR 8 22114419 22133384 -1 protein_coding HR lysine demethylase and nuclear receptor corepressor [Source:HGNC Symbol;Acc:HGNC:5172] Duplication
248 ENSG00000288677 HRURF 8 22130458 22131010 -1 protein_coding HR upstream open reading frame [Source:HGNC Symbol;Acc:HGNC:55085] Duplication
249 ENSG00000168476 REEP4 8 22138020 22141951 -1 protein_coding receptor accessory protein 4 [Source:HGNC Symbol;Acc:HGNC:26176] Duplication
251 ENSG00000168481 LGI3 8 22146830 22157084 -1 protein_coding leucine rich repeat LGI family member 3 [Source:HGNC Symbol;Acc:HGNC:18711] Duplication
252 ENSG00000168484 SFTPC 8 22156913 22164479 1 protein_coding surfactant protein C [Source:HGNC Symbol;Acc:HGNC:10802] Duplication
253 ENSG00000168487 BMP1 8 22165140 22212326 1 protein_coding bone morphogenetic protein 1 [Source:HGNC Symbol;Acc:HGNC:1067] Duplication
256 ENSG00000168490 PHYHIP 8 22219703 22232101 -1 protein_coding phytanoyl-CoA 2-hydroxylase interacting protein [Source:HGNC Symbol;Acc:HGNC:16865] Duplication
259 ENSG00000168495 POLR3D 8 22245133 22254601 1 protein_coding RNA polymerase III subunit D [Source:HGNC Symbol;Acc:HGNC:1080] Duplication
261 ENSG00000197181 PIWIL2 8 22275316 22357568 1 protein_coding piwi like RNA-mediated gene silencing 2 [Source:HGNC Symbol;Acc:HGNC:17644] Duplication
263 ENSG00000104635 SLC39A14 8 22367278 22434129 1 protein_coding solute carrier family 39 member 14 [Source:HGNC Symbol;Acc:HGNC:20858] Duplication
266 ENSG00000120910 PPP3CC 8 22440819 22541142 1 protein_coding protein phosphatase 3 catalytic subunit gamma [Source:HGNC Symbol;Acc:HGNC:9316] Duplication
269 ENSG00000120896 SORBS3 8 22544986 22575788 1 protein_coding sorbin and SH3 domain containing 3 [Source:HGNC Symbol;Acc:HGNC:30907] Duplication
273 ENSG00000120913 PDLIM2 8 22578279 22598025 1 protein_coding PDZ and LIM domain 2 [Source:HGNC Symbol;Acc:HGNC:13992] Duplication
274 ENSG00000248235 8 22589274 22602084 1 protein_coding novel protein Duplication
275 ENSG00000241852 C8orf58 8 22599599 22604150 1 protein_coding chromosome 8 open reading frame 58 [Source:HGNC Symbol;Acc:HGNC:32233] Duplication
277 ENSG00000158941 CCAR2 8 22604757 22620964 1 protein_coding cell cycle and apoptosis regulator 2 [Source:HGNC Symbol;Acc:HGNC:23360] Duplication
279 ENSG00000147439 BIN3 8 22620418 22669148 -1 protein_coding bridging integrator 3 [Source:HGNC Symbol;Acc:HGNC:1054] Duplication
283 ENSG00000179388 EGR3 8 22687659 22693480 -1 protein_coding early growth response 3 [Source:HGNC Symbol;Acc:HGNC:3240] Duplication
288 ENSG00000134020 PEBP4 8 22713251 23000000 -1 protein_coding phosphatidylethanolamine binding protein 4 [Source:HGNC Symbol;Acc:HGNC:28319] Duplication
297 ENSG00000008853 RHOBTB2 8 22987417 23020509 1 protein_coding Rho related BTB domain containing 2 [Source:HGNC Symbol;Acc:HGNC:18756] Duplication
298 ENSG00000120889 TNFRSF10B 8 23020133 23069031 -1 protein_coding TNF receptor superfamily member 10b [Source:HGNC Symbol;Acc:HGNC:11905] Duplication
302 ENSG00000284956 8 23084403 23115536 1 protein_coding novel protein Duplication
303 ENSG00000173535 TNFRSF10C 8 23102921 23117445 1 protein_coding TNF receptor superfamily member 10c [Source:HGNC Symbol;Acc:HGNC:11906] Duplication
304 ENSG00000173530 TNFRSF10D 8 23135588 23164027 -1 protein_coding TNF receptor superfamily member 10d [Source:HGNC Symbol;Acc:HGNC:11907] Duplication
307 ENSG00000104689 TNFRSF10A 8 23190452 23225102 -1 protein_coding TNF receptor superfamily member 10a [Source:HGNC Symbol;Acc:HGNC:11904] Duplication
312 ENSG00000147457 CHMP7 8 23243637 23262000 1 protein_coding charged multivesicular body protein 7 [Source:HGNC Symbol;Acc:HGNC:28439] Duplication
314 ENSG00000104679 R3HCC1 8 23270120 23296279 1 protein_coding R3H domain and coiled-coil containing 1 [Source:HGNC Symbol;Acc:HGNC:27329] Duplication
316 ENSG00000134013 LOXL2 8 23296897 23425328 -1 protein_coding lysyl oxidase like 2 [Source:HGNC Symbol;Acc:HGNC:6666] Duplication
318 ENSG00000197217 ENTPD4 8 23385783 23457695 -1 protein_coding ectonucleoside triphosphate diphosphohydrolase 4 [Source:HGNC Symbol;Acc:HGNC:14573] Duplication
326 ENSG00000147454 SLC25A37 8 23528956 23575463 1 protein_coding solute carrier family 25 member 37 [Source:HGNC Symbol;Acc:HGNC:29786] Duplication
330 ENSG00000167034 NKX3-1 8 23678697 23682938 -1 protein_coding NK3 homeobox 1 [Source:HGNC Symbol;Acc:HGNC:7838] Duplication
331 ENSG00000180053 NKX2-6 8 23701740 23706756 -1 protein_coding NK2 homeobox 6 [Source:HGNC Symbol;Acc:HGNC:32940] Duplication
338 ENSG00000159167 STC1 8 23841929 23854806 -1 protein_coding stanniocalcin 1 [Source:HGNC Symbol;Acc:HGNC:11373] Duplication
343 ENSG00000042980 ADAM28 8 24294069 24359014 1 protein_coding ADAM metallopeptidase domain 28 [Source:HGNC Symbol;Acc:HGNC:206] Duplication
344 ENSG00000134028 ADAMDEC1 8 24384285 24406013 1 protein_coding ADAM like decysin 1 [Source:HGNC Symbol;Acc:HGNC:16299] Duplication
345 ENSG00000069206 ADAM7 8 24440930 24526970 1 protein_coding ADAM metallopeptidase domain 7 [Source:HGNC Symbol;Acc:HGNC:214] Duplication
349 ENSG00000104722 NEFM 8 24913758 24919098 1 protein_coding neurofilament medium chain [Source:HGNC Symbol;Acc:HGNC:7734] Duplication
351 ENSG00000277586 NEFL 8 24950955 24956721 -1 protein_coding neurofilament light chain [Source:HGNC Symbol;Acc:HGNC:7739] Duplication
362 ENSG00000147459 DOCK5 8 25184689 25418082 1 protein_coding dedicator of cytokinesis 5 [Source:HGNC Symbol;Acc:HGNC:23476] Duplication
366 ENSG00000147437 GNRH1 8 25419258 25424654 -1 protein_coding gonadotropin releasing hormone 1 [Source:HGNC Symbol;Acc:HGNC:4419] Duplication
368 ENSG00000104756 KCTD9 8 25427847 25458476 -1 protein_coding potassium channel tetramerization domain containing 9 [Source:HGNC Symbol;Acc:HGNC:22401] Duplication
369 ENSG00000184661 CDCA2 8 25459199 25507911 1 protein_coding cell division cycle associated 2 [Source:HGNC Symbol;Acc:HGNC:14623] Duplication
377 ENSG00000221818 EBF2 8 25841725 26045413 -1 protein_coding EBF transcription factor 2 [Source:HGNC Symbol;Acc:HGNC:19090] Duplication
386 ENSG00000221914 PPP2R2A 8 26291508 26372680 1 protein_coding protein phosphatase 2 regulatory subunit Balpha [Source:HGNC Symbol;Acc:HGNC:9304] Duplication
389 ENSG00000104765 BNIP3L 8 26383054 26505636 1 protein_coding BCL2 interacting protein 3 like [Source:HGNC Symbol;Acc:HGNC:1085] Duplication
394 ENSG00000240694 PNMA2 8 26504701 26514092 -1 protein_coding PNMA family member 2 [Source:HGNC Symbol;Acc:HGNC:9159] Duplication
395 ENSG00000092964 DPYSL2 8 26514031 26658178 1 protein_coding dihydropyrimidinase like 2 [Source:HGNC Symbol;Acc:HGNC:3014] Duplication
399 ENSG00000120907 ADRA1A 8 26748150 26867278 -1 protein_coding adrenoceptor alpha 1A [Source:HGNC Symbol;Acc:HGNC:277] Duplication
411 ENSG00000015592 STMN4 8 27235308 27258420 -1 protein_coding stathmin 4 [Source:HGNC Symbol;Acc:HGNC:16078] Duplication
412 ENSG00000104228 TRIM35 8 27284886 27311272 -1 protein_coding tripartite motif containing 35 [Source:HGNC Symbol;Acc:HGNC:16285] Duplication
413 ENSG00000120899 PTK2B 8 27311482 27459391 1 protein_coding protein tyrosine kinase 2 beta [Source:HGNC Symbol;Acc:HGNC:9612] Duplication
415 ENSG00000120903 CHRNA2 8 27459756 27479883 -1 protein_coding cholinergic receptor nicotinic alpha 2 subunit [Source:HGNC Symbol;Acc:HGNC:1956] Duplication
416 ENSG00000120915 EPHX2 8 27490781 27548615 1 protein_coding epoxide hydrolase 2 [Source:HGNC Symbol;Acc:HGNC:3402] Duplication
420 ENSG00000120885 CLU 8 27596917 27614700 -1 protein_coding clusterin [Source:HGNC Symbol;Acc:HGNC:2095] Duplication
422 ENSG00000168077 SCARA3 8 27633868 27676776 1 protein_coding scavenger receptor class A member 3 [Source:HGNC Symbol;Acc:HGNC:19000] Duplication
430 ENSG00000147419 CCDC25 8 27733316 27772653 -1 protein_coding coiled-coil domain containing 25 [Source:HGNC Symbol;Acc:HGNC:25591] Duplication
431 ENSG00000171320 ESCO2 8 27771949 27812640 1 protein_coding establishment of sister chromatid cohesion N-acetyltransferase 2 [Source:HGNC Symbol;Acc:HGNC:27230] Duplication
433 ENSG00000168078 PBK 8 27809624 27838082 -1 protein_coding PDZ binding kinase [Source:HGNC Symbol;Acc:HGNC:18282] Duplication
436 ENSG00000168079 SCARA5 8 27869883 27992673 -1 protein_coding scavenger receptor class A member 5 [Source:HGNC Symbol;Acc:HGNC:28701] Duplication
441 ENSG00000189233 NUGGC 8 28021964 28083936 -1 protein_coding nuclear GTPase, germinal center associated [Source:HGNC Symbol;Acc:HGNC:33550] Duplication
442 ENSG00000134014 ELP3 8 28089673 28191156 1 protein_coding elongator acetyltransferase complex subunit 3 [Source:HGNC Symbol;Acc:HGNC:20696] Duplication
449 ENSG00000168081 PNOC 8 28316986 28343355 1 protein_coding prepronociceptin [Source:HGNC Symbol;Acc:HGNC:9163] Duplication
450 ENSG00000186918 ZNF395 8 28345590 28402701 -1 protein_coding zinc finger protein 395 [Source:HGNC Symbol;Acc:HGNC:18737] Duplication
451 ENSG00000214050 FBXO16 8 28348287 28490278 -1 protein_coding F-box protein 16 [Source:HGNC Symbol;Acc:HGNC:13618] Duplication
457 ENSG00000104290 FZD3 8 28494205 28574267 1 protein_coding frizzled class receptor 3 [Source:HGNC Symbol;Acc:HGNC:4041] Duplication
461 ENSG00000012232 EXTL3 8 28600469 28756561 1 protein_coding exostosin like glycosyltransferase 3 [Source:HGNC Symbol;Acc:HGNC:3518] Duplication
465 ENSG00000104299 INTS9 8 28767661 28890242 -1 protein_coding integrator complex subunit 9 [Source:HGNC Symbol;Acc:HGNC:25592] Duplication
468 ENSG00000147421 HMBOX1 8 28890395 29064764 1 protein_coding homeobox containing 1 [Source:HGNC Symbol;Acc:HGNC:26137] Duplication
475 ENSG00000197892 KIF13B 8 29067278 29263124 -1 protein_coding kinesin family member 13B [Source:HGNC Symbol;Acc:HGNC:14405] Duplication
481 ENSG00000120875 DUSP4 8 29333064 29350684 -1 protein_coding dual specificity phosphatase 4 [Source:HGNC Symbol;Acc:HGNC:3070] Duplication
512 ENSG00000133872 SARAF 8 30063003 30083208 -1 protein_coding store-operated calcium entry associated regulatory factor [Source:HGNC Symbol;Acc:HGNC:28789] Duplication
515 ENSG00000104660 LEPROTL1 8 30095408 30177208 1 protein_coding leptin receptor overlapping transcript like 1 [Source:HGNC Symbol;Acc:HGNC:6555] Duplication
517 ENSG00000177669 MBOAT4 8 30131671 30144665 -1 protein_coding membrane bound O-acyltransferase domain containing 4 [Source:HGNC Symbol;Acc:HGNC:32311] Duplication
519 ENSG00000104671 DCTN6 8 30156319 30183639 1 protein_coding dynactin subunit 6 [Source:HGNC Symbol;Acc:HGNC:16964] Duplication
535 ENSG00000157110 RBPMS 8 30384511 30572256 1 protein_coding RNA binding protein, mRNA processing factor [Source:HGNC Symbol;Acc:HGNC:19097] Duplication
539 ENSG00000197265 GTF2E2 8 30578318 30658236 -1 protein_coding general transcription factor IIE subunit 2 [Source:HGNC Symbol;Acc:HGNC:4651] Duplication
541 ENSG00000253457 SMIM18 8 30638580 30646064 1 protein_coding small integral membrane protein 18 [Source:HGNC Symbol;Acc:HGNC:42973] Duplication
543 ENSG00000104687 GSR 8 30678066 30727846 -1 protein_coding glutathione-disulfide reductase [Source:HGNC Symbol;Acc:HGNC:4623] Duplication
544 ENSG00000104691 UBXN8 8 30729131 30767006 1 protein_coding UBX domain protein 8 [Source:HGNC Symbol;Acc:HGNC:30307] Duplication
546 ENSG00000104695 PPP2CB 8 30774457 30814314 -1 protein_coding protein phosphatase 2 catalytic subunit beta [Source:HGNC Symbol;Acc:HGNC:9300] Duplication
547 ENSG00000133863 TEX15 8 30831544 30913008 -1 protein_coding testis expressed 15, meiosis and synapsis associated [Source:HGNC Symbol;Acc:HGNC:11738] Duplication
551 ENSG00000172733 PURG 8 30995802 31033715 -1 protein_coding purine rich element binding protein G [Source:HGNC Symbol;Acc:HGNC:17930] Duplication
552 ENSG00000165392 WRN 8 31033788 31176138 1 protein_coding WRN RecQ like helicase [Source:HGNC Symbol;Acc:HGNC:12791] Duplication
563 ENSG00000157168 NRG1 8 31639222 32855666 1 protein_coding neuregulin 1 [Source:HGNC Symbol;Acc:HGNC:7997] Duplication
571 ENSG00000286131 8 32647202 32647390 1 protein_coding novel protein Duplication
582 ENSG00000172728 FUT10 8 33370824 33473146 -1 protein_coding fucosyltransferase 10 [Source:HGNC Symbol;Acc:HGNC:19234] Duplication
585 ENSG00000129696 TTI2 8 33473386 33513185 -1 protein_coding TELO2 interacting protein 2 [Source:HGNC Symbol;Acc:HGNC:26262] Duplication
586 ENSG00000198042 MAK16 8 33485182 33501262 1 protein_coding MAK16 homolog [Source:HGNC Symbol;Acc:HGNC:13703] Duplication
592 ENSG00000133874 RNF122 8 33547754 33567128 -1 protein_coding ring finger protein 122 [Source:HGNC Symbol;Acc:HGNC:21147] Duplication
594 ENSG00000133878 DUSP26 8 33591330 33600023 -1 protein_coding dual specificity phosphatase 26 [Source:HGNC Symbol;Acc:HGNC:28161] Duplication
615 ENSG00000156687 UNC5D 8 35235475 35796550 1 protein_coding unc-5 netrin receptor D [Source:HGNC Symbol;Acc:HGNC:18634] Duplication
633 ENSG00000215262 KCNU1 8 36784324 36936125 1 protein_coding potassium calcium-activated channel subfamily U member 1 [Source:HGNC Symbol;Acc:HGNC:18867] Duplication
659 ENSG00000183779 ZNF703 8 37695782 37700019 1 protein_coding zinc finger protein 703 [Source:HGNC Symbol;Acc:HGNC:25883] Duplication
663 ENSG00000147475 ERLIN2 8 37736601 37758422 1 protein_coding ER lipid raft associated 2 [Source:HGNC Symbol;Acc:HGNC:1356] Duplication
666 ENSG00000147471 PLPBP 8 37762595 37779768 1 protein_coding pyridoxal phosphate binding protein [Source:HGNC Symbol;Acc:HGNC:9457] Duplication
668 ENSG00000020181 ADGRA2 8 37784191 37844896 1 protein_coding adhesion G protein-coupled receptor A2 [Source:HGNC Symbol;Acc:HGNC:17849] Duplication
670 ENSG00000104221 BRF2 8 37843268 37849861 -1 protein_coding BRF2 RNA polymerase III transcription initiation factor subunit [Source:HGNC Symbol;Acc:HGNC:17298] Duplication
671 ENSG00000156675 RAB11FIP1 8 37858618 37899497 -1 protein_coding RAB11 family interacting protein 1 [Source:HGNC Symbol;Acc:HGNC:30265] Duplication
674 ENSG00000169154 GOT1L1 8 37934281 37940124 -1 protein_coding glutamic-oxaloacetic transaminase 1 like 1 [Source:HGNC Symbol;Acc:HGNC:28487] Duplication
675 ENSG00000285880 8 37934340 37965953 -1 protein_coding ADRB3-GOT1L1 readthrough Duplication
677 ENSG00000188778 ADRB3 8 37962990 37966599 -1 protein_coding adrenoceptor beta 3 [Source:HGNC Symbol;Acc:HGNC:288] Duplication
679 ENSG00000187840 EIF4EBP1 8 38030534 38060365 1 protein_coding eukaryotic translation initiation factor 4E binding protein 1 [Source:HGNC Symbol;Acc:HGNC:3288] Duplication
684 ENSG00000129691 ASH2L 8 38105493 38144076 1 protein_coding ASH2 like, histone lysine methyltransferase complex subunit [Source:HGNC Symbol;Acc:HGNC:744] Duplication
687 ENSG00000147465 STAR 8 38142700 38150992 -1 protein_coding steroidogenic acute regulatory protein [Source:HGNC Symbol;Acc:HGNC:11359] Duplication
689 ENSG00000175324 LSM1 8 38163335 38176730 -1 protein_coding LSM1 homolog, mRNA degradation associated [Source:HGNC Symbol;Acc:HGNC:20472] Duplication
691 ENSG00000156735 BAG4 8 38176533 38213301 1 protein_coding BAG cochaperone 4 [Source:HGNC Symbol;Acc:HGNC:940] Duplication
694 ENSG00000085788 DDHD2 8 38225218 38275558 1 protein_coding DDHD domain containing 2 [Source:HGNC Symbol;Acc:HGNC:29106] Duplication
695 ENSG00000147535 PLPP5 8 38263130 38269243 -1 protein_coding phospholipid phosphatase 5 [Source:HGNC Symbol;Acc:HGNC:25026] Duplication
696 ENSG00000147548 NSD3 8 38269704 38382272 -1 protein_coding nuclear receptor binding SET domain protein 3 [Source:HGNC Symbol;Acc:HGNC:12767] Duplication
701 ENSG00000165046 LETM2 8 38386207 38409527 1 protein_coding leucine zipper and EF-hand containing transmembrane protein 2 [Source:HGNC Symbol;Acc:HGNC:14648] Duplication
702 ENSG00000077782 FGFR1 8 38400215 38468834 -1 protein_coding fibroblast growth factor receptor 1 [Source:HGNC Symbol;Acc:HGNC:3688] Duplication
718 ENSG00000147526 TACC1 8 38728186 38853028 1 protein_coding transforming acidic coiled-coil containing protein 1 [Source:HGNC Symbol;Acc:HGNC:11522] Duplication
723 ENSG00000169499 PLEKHA2 8 38901235 38973912 1 protein_coding pleckstrin homology domain containing A2 [Source:HGNC Symbol;Acc:HGNC:14336] Duplication
725 ENSG00000169495 HTRA4 8 38974228 38988663 1 protein_coding HtrA serine peptidase 4 [Source:HGNC Symbol;Acc:HGNC:26909] Duplication
726 ENSG00000169490 TM2D2 8 38988808 38996824 -1 protein_coding TM2 domain containing 2 [Source:HGNC Symbol;Acc:HGNC:24127] Duplication
727 ENSG00000168615 ADAM9 8 38996754 39105445 1 protein_coding ADAM metallopeptidase domain 9 [Source:HGNC Symbol;Acc:HGNC:216] Duplication
729 ENSG00000197140 ADAM32 8 39106990 39284917 1 protein_coding ADAM metallopeptidase domain 32 [Source:HGNC Symbol;Acc:HGNC:15479] Duplication
739 ENSG00000168619 ADAM18 8 39584489 39730065 1 protein_coding ADAM metallopeptidase domain 18 [Source:HGNC Symbol;Acc:HGNC:196] Duplication
740 ENSG00000104755 ADAM2 8 39743735 39838227 -1 protein_coding ADAM metallopeptidase domain 2 [Source:HGNC Symbol;Acc:HGNC:198] Duplication
743 ENSG00000131203 IDO1 8 39902275 39928790 1 protein_coding indoleamine 2,3-dioxygenase 1 [Source:HGNC Symbol;Acc:HGNC:6059] Duplication
747 ENSG00000188676 IDO2 8 39934614 40016392 1 protein_coding indoleamine 2,3-dioxygenase 2 [Source:HGNC Symbol;Acc:HGNC:27269] Duplication
753 ENSG00000176907 TCIM 8 40153482 40155310 1 protein_coding transcriptional and immune response regulator [Source:HGNC Symbol;Acc:HGNC:1357] Duplication
# Ensembl ids that appear in both the deletion-region and the
# duplication-region protein-coding gene tables.
overlapping_genes <- intersect(
  protein_coding_genes_del$ensembl_gene_id,
  protein_coding_genes_dup$ensembl_gene_id
)

# Report the shared ids, or state explicitly that there are none.
if (length(overlapping_genes) == 0) {
  cat("No genes are present in both the deletion and duplication regions.\n")
} else {
  cat("Genes present in both regions:\n")
  print(overlapping_genes)
}
## No genes are present in both the deletion and duplication regions.

8p analysis

# Raw gene-level count table and the sample sheet.
counts <- read.csv("data/gene_count.csv")
samples <- read_xlsx("data/Sample List.xlsx")

# Split the count file by column: gene_id plus one column per listed sample
# (raw.counts) versus every remaining per-gene column (annotation).
raw.counts <- dplyr::select(counts, gene_id, all_of(samples$`Sample Name`))
annotation <- dplyr::select(counts, -all_of(samples$`Sample Name`))

# Relabel the MOM and DAD groups as a single "Parents" group; every other
# group label is left unchanged.
samples.parents <- samples %>%
  mutate(
    `Group Name` = ifelse(
      `Group Name` %in% c("MOM", "DAD"),
      "Parents",
      `Group Name`
    )
  )
print(samples.parents)
## # A tibble: 12 × 7
##    `Sample Name` `Cell line`    Concentration (ng/uL…¹ `Volume (uL)` `A260/A280`
##    <chr>         <chr>                           <dbl>         <dbl>       <dbl>
##  1 SL_1          31.3 (Sample …                  1194.            30        2.09
##  2 SL_2          31.3 (Sample …                  1068.            30        2.08
##  3 SL_3          31.3 (Sample …                  1097.            30        2.08
##  4 SL_4          JE01 214 (Sam…                   523.            30        2.00
##  5 SL_5          JE01 214 (Sam…                   762.            30        2.04
##  6 SL_6          JE01 214 (Sam…                   410.            30        1.96
##  7 SL_7          255-1 (p13)                      244.            30        2.08
##  8 SL_8          255-4 (p13)                      606.            30        2.09
##  9 SL_9          255-3 (p13)                      495.            30        2.06
## 10 SL_10         294-1                           1128.            30        2.07
## 11 SL_11         294-2                            750.            30        2.07
## 12 SL_12         294-3                            382.            30        2.09
## # ℹ abbreviated name: ¹​`Concentration (ng/uL)`
## # ℹ 2 more variables: `A260/A230` <dbl>, `Group Name` <chr>
# edgeR container: counts, sample sheet, and the parent-collapsed grouping.
y.parents <- DGEList(
  counts = raw.counts,
  samples = samples.parents,
  group = samples.parents$`Group Name`
)
y.parents$genes <- annotation

# filter lowly expressed transcripts
keep <- filterByExpr(
  y.parents,
  min.count = 30,
  min.total.count = 50,
  large.n = 10,
  min.prop = 0.75
)
table(keep)
## keep
## FALSE  TRUE 
## 43987 14748
# Keep only the genes that passed the expression filter and recompute the
# library sizes from the retained counts.
y.parents <- y.parents[keep, , keep.lib.sizes = FALSE]
y.parents <- normLibSizes(y.parents) # TMM normalization

# calculate CPM
# log2 CPM on the normalized library sizes. TRUE is spelled out because T is
# an ordinary variable that can be reassigned elsewhere in a session.
log2_cpm <- cpm(
  y.parents,
  log = TRUE,
  prior.count = 1,
  normalized.lib.sizes = TRUE
)

# Per-gene annotation columns side by side with the per-sample log2 CPM.
log2_tmm_data_with_annotations <- cbind(y.parents$genes, log2_cpm)

# DE analysis
plotMDS(y.parents)

# Design with one coefficient per group and no intercept. The "group" prefix
# is stripped so the columns carry the plain group labels used in contrasts.
samples.design.parents <- model.matrix(~ 0 + group, data = y.parents$samples)
colnames(samples.design.parents) <- gsub(
  "group", "", colnames(samples.design.parents)
)

# Dispersion estimation against that design; print the common dispersion.
y.parents <- estimateDisp(
  y.parents,
  design = samples.design.parents,
  robust = TRUE
)
print(y.parents$common.dispersion)
## [1] 0.03257095
plotBCV(y.parents)

# Quasi-likelihood GLM fit and its dispersion diagnostic plot.
fit.parents <- glmQLFit(
  y.parents,
  design = samples.design.parents,
  robust = TRUE
)
plotQLDisp(fit.parents)

# make contrast: PRO minus REV
rev.contrast <- makeContrasts(
  PROvsREV = PRO - REV,
  levels = samples.design.parents
)

qlf.PROvsREV <- glmQLFTest(
  fit.parents,
  contrast = rev.contrast[, "PROvsREV"]
)

# Bundle the test output with the get_results() helper defined at the top of
# the file (default FDR cutoff), then plot and tabulate it.
contr.rev <- get_results(contrast = "PRO - REV", qlf = qlf.PROvsREV)

parents_results_list <- list(contr.rev)
plot_all_results(parents_results_list)

table.results <- plot_thresholded_results(parents_results_list)

Supplementary Figure S3BC

# Centromere records; lines beginning with "#" are treated as comments.
centromeres <- read_tsv("data/centromeres-UCSC.bed", comment = "#")

# One row per chromosome: smallest chromStart and largest chromEnd.
centromeres_summary <- centromeres %>%
  group_by(chrom) %>%
  summarize(start = min(chromStart), end = max(chromEnd))

# Sample-to-group lookup restricted to the REV and PRO groups.
sample_to_group <- samples %>%
  dplyr::select(`Sample Name`, `Group Name`) %>%
  dplyr::filter(`Group Name` %in% c("REV", "PRO"))

# Gene id, logFC and chromosome from the PRO - REV result table.
gene_logfc <- dplyr::select(
  contr.rev$top_tags$table,
  gene_id, logFC, gene_chr
)

# Mean and median PRO - REV logFC for each autosome (1-22), built as one
# single-row tibble per chromosome and then stacked.
summary_list <- lapply(1:22, function(chrom_idx) {
  chrom_logfc <- gene_logfc %>%
    filter(gene_chr == as.character(chrom_idx)) %>%
    pull(logFC)

  tibble(
    CHR = chrom_idx,
    mean_logFC = mean(chrom_logfc, na.rm = TRUE),
    median_logFC = median(chrom_logfc, na.rm = TRUE)
  )
})

final_summary <- bind_rows(summary_list)

final_summary %>% gt()
CHR mean_logFC median_logFC
1 0.03530399 0.002479699
2 0.07225343 0.001113815
3 0.04830308 0.014212023
4 0.07049886 0.027990760
5 0.07644282 0.011892850
6 0.01334637 -0.009420676
7 0.04393906 0.002151135
8 0.19836010 0.121154181
9 0.03652699 -0.010908634
10 0.07900521 -0.009744297
11 0.02607693 -0.019861725
12 0.09566353 0.033873698
13 0.12699998 0.020871230
14 0.05985903 0.020982288
15 0.07295819 0.038859233
16 0.02208292 -0.034985374
17 0.02113778 -0.019461390
18 0.13336673 0.018704282
19 0.09589128 0.035091883
20 0.06615149 0.002520803
21 0.05502288 0.008762923
22 0.04180247 -0.027330879
library(grid)
# chr 7
chr <- paste0("chr", 7)

# Centromere span of this chromosome, from the per-chromosome summary table.
centromere_start <- centromeres_summary %>%
  filter(chrom == chr) %>%
  pull(start)
centromere_end <- centromeres_summary %>%
  filter(chrom == chr) %>%
  pull(end)

# Per-gene, per-group mean log2 CPM for chr7 genes whose start lies outside
# the centromere span. Samples without a PRO/REV label get an NA group from
# the left join and are dropped further down.
gene.deldup.tmm <- log2_tmm_data_with_annotations %>%
  filter(gene_chr == as.character(7)) %>%
  filter(!(gene_start >= centromere_start & gene_start < centromere_end)) %>%
  dplyr::select(gene_id, gene_start, gene_end, all_of(samples$`Sample Name`)) %>%
  pivot_longer(cols = -c(gene_id, gene_start, gene_end), names_to = "Sample", values_to = "Expression") %>%
  left_join(sample_to_group, by = c("Sample" = "Sample Name")) %>%  # Map samples to their groups
  group_by(gene_id, gene_start, gene_end, `Group Name`) %>%
  summarise(Average_Expression = mean(Expression, na.rm = TRUE), .groups = "drop") %>%
  arrange(gene_start, desc(Average_Expression))

# PRO minus REV per gene, smoothed separately on each chromosome arm with
# custom_rollmean() (defined elsewhere in this file; k = 14 is presumably the
# window width in genes -- confirm against its definition).
gene.deldup.tmm.plot.relative.PRO <- gene.deldup.tmm %>%
  mutate(region = case_when(
    gene_start >= 0 & gene_start < centromere_start ~ "p",
    gene_start >= centromere_end ~ "q"
  )) %>%
  filter(`Group Name` %in% c('PRO', 'REV')) %>%  # Filter for PRO and REV
  tidyr::pivot_wider(names_from = `Group Name`, values_from = Average_Expression) %>%
  # Calculate relative expression as PRO average minus REV average
  mutate(relative_expression = PRO - REV) %>%
  group_by(region) %>%
  mutate(rolling_avg = custom_rollmean(relative_expression, k = 14)) %>%
  ungroup()

text_cen <- textGrob("Centromere", gp = gpar(fontsize = 18, fontface = "bold"))

plot_object <- ggplot(gene.deldup.tmm.plot.relative.PRO, aes(x = (gene_start + gene_end) / 2, y = rolling_avg)) +
  # One shaded rectangle for the centromere. annotate() draws it once;
  # geom_rect(aes(<constants>)) would draw one copy per data row and the
  # stacked copies would cancel out the requested alpha.
  annotate("rect",
           xmin = centromere_start, xmax = centromere_end, ymin = -Inf, ymax = Inf,
           fill = "grey", colour = "grey", alpha = 0.6) +
  # Label placed below the panel (y = -1.35 is under the lower y limit); it is
  # visible only because clipping is switched off further down.
  annotation_custom(text_cen,
                    xmin = (centromere_start + centromere_end) / 2,
                    xmax = (centromere_start + centromere_end) / 2,
                    ymin = -1.35, ymax = -1.35) +
  # linewidth replaces size for lines (ggplot2 >= 3.4.0).
  geom_hline(yintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5, alpha = 0.8) +
  geom_point(size = 1.5, alpha = 0.9, color = "#727572") +
  labs(
    title = "Gene Expression on Chromosome 7 - Proband relative to Revertant",
    x = "Chromosome Position (bp)",
    y = "Relative Expression Level (log2)",
    color = NULL
  ) +
  scale_y_continuous(limits = c(-1, 1.5), breaks = seq(-1.5, 1.5, by = 0.5)) +
  # Labels are derived from the breaks, so their number always matches even
  # if the largest gene_end changes.
  scale_x_continuous(expand = c(0, 0),
                 limits = c(0, max(gene.deldup.tmm.plot.relative.PRO$gene_end) + 2e6),
                 breaks = seq(from = 0, to = max(gene.deldup.tmm.plot.relative.PRO$gene_end), by = 20000000),
                 labels = scales::label_comma()) +
  expand_limits(x = 0, y = -1.5) +
  theme_classic() +
  coord_cartesian(clip = "off") +
  theme(
    # Set after theme_classic(): a complete theme replaces every element
    # added before it, so the margin has to be applied here to take effect.
    plot.margin = unit(c(1, 1, 2, 1), "lines"),
    text = element_text(family = "Arial", face = "bold"),
    plot.title = element_text(size = 28, face = "bold", hjust = 0.5, family = "Arial"),
    axis.title = element_text(size = 20, family = "Arial"),
    axis.text = element_text(size = 18, family = "Arial"),
    axis.title.x = element_text(margin = margin(t = 20)),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(plot_object)

# ggsave(
#   filename = paste("CHR7_gene_expression_plot_log2_PROvsREV.png"),
#   plot = plot_object,
#   width = 18,   # Adjust the width to make the canvas longer
#   height = 6,   # Adjust the height to control the aspect ratio
#   dpi = 300,     # Set the resolution to 300 dpi
#   bg = "transparent"
# )

# chr 9
chr <- paste0("chr", 9)

# Centromere span of this chromosome, from the per-chromosome summary table.
centromere_start <- centromeres_summary %>%
  filter(chrom == chr) %>%
  pull(start)
centromere_end <- centromeres_summary %>%
  filter(chrom == chr) %>%
  pull(end)

# Per-gene, per-group mean log2 CPM for chr9 genes whose start lies outside
# the centromere span. Samples without a PRO/REV label get an NA group from
# the left join and are dropped further down.
gene.deldup.tmm <- log2_tmm_data_with_annotations %>%
  filter(gene_chr == as.character(9)) %>%
  filter(!(gene_start >= centromere_start & gene_start < centromere_end)) %>%
  dplyr::select(gene_id, gene_start, gene_end, all_of(samples$`Sample Name`)) %>%
  pivot_longer(cols = -c(gene_id, gene_start, gene_end), names_to = "Sample", values_to = "Expression") %>%
  left_join(sample_to_group, by = c("Sample" = "Sample Name")) %>%  # Map samples to their groups
  group_by(gene_id, gene_start, gene_end, `Group Name`) %>%
  summarise(Average_Expression = mean(Expression, na.rm = TRUE), .groups = "drop") %>%
  arrange(gene_start, desc(Average_Expression))

# PRO minus REV per gene, smoothed separately on each chromosome arm with
# custom_rollmean() (defined elsewhere in this file; k = 14 is presumably the
# window width in genes -- confirm against its definition).
gene.deldup.tmm.plot.relative.PRO <- gene.deldup.tmm %>%
  mutate(region = case_when(
    gene_start >= 0 & gene_start < centromere_start ~ "p",
    gene_start >= centromere_end ~ "q"
  )) %>%
  # Keep PRO/REV rows that were assigned to an arm. No row is ever labelled
  # "c", so the previous `region != "c"` test only removed NA regions; the
  # NA check states that directly and selects the same rows.
  filter(`Group Name` %in% c('PRO', 'REV'), !is.na(region)) %>%
  tidyr::pivot_wider(names_from = `Group Name`, values_from = Average_Expression) %>%
  # Calculate relative expression as PRO average minus REV average
  mutate(relative_expression = PRO - REV) %>%
  group_by(region) %>%
  mutate(rolling_avg = custom_rollmean(relative_expression, k = 14)) %>%
  ungroup()

text_cen <- textGrob("Centromere", gp = gpar(fontsize = 18, fontface = "bold"))

plot_object <- ggplot(gene.deldup.tmm.plot.relative.PRO, aes(x = (gene_start + gene_end) / 2, y = rolling_avg)) +
  # One shaded rectangle for the centromere. annotate() draws it once;
  # geom_rect(aes(<constants>)) would draw one copy per data row and the
  # stacked copies would cancel out the requested alpha.
  annotate("rect",
           xmin = centromere_start, xmax = centromere_end, ymin = -Inf, ymax = Inf,
           fill = "grey", colour = "grey", alpha = 0.6) +
  # Label placed below the panel (y = -1.35 is under the lower y limit); it is
  # visible only because clipping is switched off further down.
  annotation_custom(text_cen,
                    xmin = (centromere_start + centromere_end) / 2,
                    xmax = (centromere_start + centromere_end) / 2,
                    ymin = -1.35, ymax = -1.35) +
  # linewidth replaces size for lines (ggplot2 >= 3.4.0).
  geom_hline(yintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5, alpha = 0.8) +
  geom_point(size = 1.5, alpha = 0.9, color = "#727572") +
  labs(
    title = "Gene Expression on Chromosome 9 - Proband relative to Revertant",
    x = "Chromosome Position (bp)",
    y = "Relative Expression Level (log2)",
    color = NULL
  ) +
  scale_y_continuous(limits = c(-1, 1.5), breaks = seq(-1.5, 1.5, by = 0.5)) +
  # Labels are derived from the breaks, so their number always matches even
  # if the largest gene_end changes.
  scale_x_continuous(expand = c(0, 0),
                 limits = c(0, max(gene.deldup.tmm.plot.relative.PRO$gene_end) + 2e6),
                 breaks = seq(from = 0, to = max(gene.deldup.tmm.plot.relative.PRO$gene_end), by = 20000000),
                 labels = scales::label_comma()) +
  expand_limits(x = 0, y = -1.5) +
  theme_classic() +
  coord_cartesian(clip = "off") +
  theme(
    # Set after theme_classic(): a complete theme replaces every element
    # added before it, so the margin has to be applied here to take effect.
    plot.margin = unit(c(1, 1, 2, 1), "lines"),
    text = element_text(family = "Arial", face = "bold"),
    plot.title = element_text(size = 28, face = "bold", hjust = 0.5, family = "Arial"),
    axis.title = element_text(size = 20, family = "Arial"),
    axis.text = element_text(size = 18, family = "Arial"),
    axis.title.x = element_text(margin = margin(t = 20)),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
print(plot_object)

# ggsave(
#   filename = paste("CHR9_gene_expression_plot_log2_PROvsREV.png"),
#   plot = plot_object,
#   width = 18,   # Adjust the width to make the canvas longer
#   height = 6,   # Adjust the height to control the aspect ratio
#   dpi = 300,     # Set the resolution to 300 dpi
#   bg = "transparent"
# )

GSE185192 DE analysis

# GSE185192 count table; lines beginning with "#" are treated as comments.
new_counts <- read.csv(
  "data/gene_counts_GSE185192.csv",
  comment.char = "#"
)

# TRUE when every gene id of the new table also occurs in the 8p count table.
in_both <- is.element(new_counts$Geneid, counts$gene_id)
all(in_both)
## [1] TRUE
# Sample sheet: three IsoE replicates labelled DS, three IsoT replicates
# labelled TS.
sample_names <- c(paste0("IsoE_Rep", 1:3), paste0("IsoT_Rep", 1:3))
GSE185192.samples <- data.frame(
  sample_name = sample_names,
  group = rep(c("DS", "TS"), each = 3)
)

# Keep the gene id plus the per-BAM count columns and give them the
# sample-sheet names.
GSE185192 <- new_counts %>%
  dplyr::select(Geneid, starts_with("aligned")) %>%
  dplyr::rename(
    gene_id = Geneid,
    IsoE_Rep1 = "aligned.SRR16242104.sorted.bam",
    IsoE_Rep2 = "aligned.SRR16242105.sorted.bam",
    IsoE_Rep3 = "aligned.SRR16242106.sorted.bam",
    IsoT_Rep1 = "aligned.SRR16242107.sorted.bam",
    IsoT_Rep2 = "aligned.SRR16242108.sorted.bam",
    IsoT_Rep3 = "aligned.SRR16242109.sorted.bam"
  )
  

# Attach the per-gene annotation (taken from the 8p count file) by gene_id.
GSE185192.full <- dplyr::left_join(GSE185192, annotation, by = "gene_id")

# get counts and annotation
GSE185192.annotation <- dplyr::select(
  GSE185192.full,
  -all_of(GSE185192.samples$sample_name)
)
GSE185192.counts <- dplyr::select(
  GSE185192.full,
  gene_id, all_of(GSE185192.samples$sample_name)
)

# edgeR container for the DS/TS comparison.
y.GSE185192 <- DGEList(
  counts = GSE185192.counts,
  samples = GSE185192.samples,
  group = GSE185192.samples$group
)
y.GSE185192$genes <- GSE185192.annotation

# filter lowly expressed transcripts
keep <- filterByExpr(
  y.GSE185192,
  min.count = 30,
  min.total.count = 50,
  large.n = 10,
  min.prop = 0.75
)
table(keep)
## keep
## FALSE  TRUE 
## 42304 16431
# Keep only the genes that passed the expression filter and recompute the
# library sizes from the retained counts.
y.GSE185192 <- y.GSE185192[keep, , keep.lib.sizes = FALSE]
y.GSE185192 <- normLibSizes(y.GSE185192) # TMM normalization

biotypes <- y.GSE185192$genes$gene_biotype

# Count the occurrences of each gene biotype
biotype_table <- table(biotypes)

# Calculate the fraction of each gene biotype
biotype_fraction <- prop.table(biotype_table)

# One row per biotype. Both table objects are converted with as.numeric():
# a table passed straight to data.frame() is expanded into a label column
# plus a Freq column, which previously produced `Fraction.biotypes` and
# `Fraction.Freq` instead of a single numeric `Fraction` column.
biotype_df <- data.frame(
  Biotype = names(biotype_fraction),
  Count = as.numeric(biotype_table),
  Fraction = round(as.numeric(biotype_fraction), 3)
)

# calculate CPM
# TRUE is spelled out because T is an ordinary, reassignable variable.
log2_cpm.GSE185192 <- cpm(y.GSE185192, log = TRUE, prior.count = 1, normalized.lib.sizes = TRUE)
log2_tmm_data_with_annotations.GSE185192 <- cbind(y.GSE185192$genes, log2_cpm.GSE185192)
# NOTE(review): this rename runs after the cbind() above, so it only affects
# log2_cpm.GSE185192 itself, not the annotated table. Order kept as-is.
colnames(log2_cpm.GSE185192) <- GSE185192.samples$sample_name
# DE analysis
plotMDS(y.GSE185192)

# Design with one coefficient per group and no intercept. The "group" prefix
# is stripped so the columns carry the plain group labels used in contrasts.
samples.design <- model.matrix(~ 0 + group, data = y.GSE185192$samples)
colnames(samples.design) <- gsub("group", "", colnames(samples.design))

# Dispersion estimation against that design; print the common dispersion.
y.GSE185192 <- estimateDisp(
  y.GSE185192,
  design = samples.design,
  robust = TRUE
)
print(y.GSE185192$common.dispersion)
## [1] 0.004212285
plotBCV(y.GSE185192)

# Quasi-likelihood GLM fit and its dispersion diagnostic plot.
fit <- glmQLFit(y.GSE185192, design = samples.design, robust = TRUE)
plotQLDisp(fit)

# make contrast: TS minus DS
contrast <- makeContrasts(TSvsDS = TS - DS, levels = samples.design)
qlf.TSvsDS <- glmQLFTest(fit, contrast = contrast[, "TSvsDS"])

# Bundle the test output with the get_results() helper defined at the top of
# the file (default FDR cutoff), then plot and tabulate it.
contr <- get_results(contrast = "TS - DS", qlf = qlf.TSvsDS)
results_list <- list(contr)
plot_all_results(results_list)

table.results <- plot_thresholded_results(results_list)

### Figure 5

chr <- paste0("chr", 21)

# Centromere span of chr21, looked up in the per-chromosome summary table.
centromere_start <- with(centromeres_summary, start[which(chrom == chr)])
centromere_end <- with(centromeres_summary, end[which(chrom == chr)])

# 8p dataset: per-gene, per-group mean log2 CPM for chr21 genes whose start
# lies outside the centromere span.
gene.deldup.tmm <- log2_tmm_data_with_annotations %>%
  filter(
    gene_chr == "21",
    gene_start < centromere_start | gene_start >= centromere_end
  ) %>%
  dplyr::select(gene_id, gene_start, gene_end, all_of(samples$`Sample Name`)) %>%
  pivot_longer(
    cols = !c(gene_id, gene_start, gene_end),
    names_to = "Sample",
    values_to = "Expression"
  ) %>%
  # Attach the group label; samples outside PRO/REV get NA here
  left_join(sample_to_group, by = c("Sample" = "Sample Name")) %>%
  group_by(gene_id, gene_start, gene_end, `Group Name`) %>%
  summarise(
    Average_Expression = mean(Expression, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(gene_start, desc(Average_Expression))

# PRO minus REV per gene, smoothed within each chromosome arm by
# custom_rollmean() (defined elsewhere in this file) with k = 8.
gene.deldup.tmm.plot.relative.PRO <- gene.deldup.tmm %>%
  mutate(
    region = case_when(
      gene_start >= 0 & gene_start < centromere_start ~ "p",
      gene_start >= centromere_end ~ "q"
    )
  ) %>%
  filter(`Group Name` %in% c('PRO', 'REV')) %>%
  tidyr::pivot_wider(
    names_from = `Group Name`,
    values_from = Average_Expression
  ) %>%
  mutate(relative_expression = PRO - REV) %>%
  group_by(region) %>%
  mutate(rolling_avg = custom_rollmean(relative_expression, k = 8)) %>%
  ungroup()

## TS21
# GSE185192 dataset: per-gene, per-group mean log2 CPM for chr21 genes whose
# start lies outside the centromere span.
gene.deldup.tmm.GSE185192 <- log2_tmm_data_with_annotations.GSE185192 %>%
  filter(
    gene_chr == "21",
    gene_start < centromere_start | gene_start >= centromere_end
  ) %>%
  dplyr::select(
    gene_id, gene_start, gene_end,
    all_of(GSE185192.samples$sample_name)
  ) %>%
  pivot_longer(
    cols = !c(gene_id, gene_start, gene_end),
    names_to = "Sample",
    values_to = "Expression"
  ) %>%
  # Attach the DS/TS group label of each sample
  left_join(GSE185192.samples, by = c("Sample" = "sample_name")) %>%
  group_by(gene_id, gene_start, gene_end, group) %>%
  summarise(
    Average_Expression = mean(Expression, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(gene_start, desc(Average_Expression))


# TS minus DS per gene, smoothed within each chromosome arm by
# custom_rollmean() (defined elsewhere in this file) with k = 8.
gene.deldup.tmm.plot.relative.TS.GSE185192 <- gene.deldup.tmm.GSE185192 %>%
  mutate(
    region = case_when(
      gene_start >= 0 & gene_start < centromere_start ~ "p",
      gene_start >= centromere_end ~ "q"
    )
  ) %>%
  filter(group %in% c('TS', 'DS')) %>%  # keep the TS and DS groups
  tidyr::pivot_wider(names_from = group, values_from = Average_Expression) %>%
  # Relative expression is the TS average minus the DS average
  mutate(relative_expression = TS - DS) %>%
  group_by(region) %>%
  mutate(rolling_avg = custom_rollmean(relative_expression, k = 8)) %>%
  ungroup()

# Genes present in both smoothed tables; the two rolling_avg columns are
# disambiguated with the suffixes _8p and _21.
plot_combined_data <- inner_join(
  dplyr::select(
    gene.deldup.tmm.plot.relative.PRO,
    gene_id, gene_start, gene_end, rolling_avg
  ),
  dplyr::select(
    gene.deldup.tmm.plot.relative.TS.GSE185192,
    gene_id, rolling_avg
  ),
  by = "gene_id",
  suffix = c("_8p", "_21")
)

# Long format: one row per gene and dataset, with display names for the legend.
plot_combined_data_long <- plot_combined_data %>%
  pivot_longer(
    cols = c(rolling_avg_8p, rolling_avg_21),
    names_to = "Dataset",
    names_prefix = "rolling_avg_",
    values_to = "rolling_avg"
  ) %>%
  mutate(
    Dataset = recode(
      Dataset,
      "8p" = "8p Proband",
      "21" = "Trisomy 21"
    )
  )
  

text_cen <- textGrob("Centromere", gp = gpar(fontsize = 18, fontface = "bold"))

# Smoothed relative expression along chr21 for both datasets, coloured by
# dataset, with the centromere span shaded.
plot_object <- ggplot(plot_combined_data_long, aes(x = (gene_start + gene_end) / 2, y = rolling_avg, color = Dataset)) +
  # One shaded rectangle for the centromere. annotate() draws it once;
  # geom_rect(aes(<constants>)) would draw one copy per data row and the
  # stacked copies would cancel out the requested alpha.
  annotate("rect",
           xmin = centromere_start, xmax = centromere_end, ymin = -Inf, ymax = Inf,
           fill = "grey", colour = "grey", alpha = 0.6) +
  # Label placed below the panel (y = -1.35 is under the lower y limit); it is
  # visible only because clipping is switched off further down.
  annotation_custom(text_cen,
                    xmin = (centromere_start + centromere_end) / 2,
                    xmax = (centromere_start + centromere_end) / 2,
                    ymin = -1.35, ymax = -1.35) +
  # linewidth replaces size for lines (ggplot2 >= 3.4.0).
  geom_hline(yintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5, alpha = 0.8) +
  geom_point(size = 1.5, alpha = 0.9) +
  labs(
    title = "Gene Expression on Chromosome 21",
    x = "Chromosome Position (bp)",
    y = "Relative Expression Level (log2)",
    color = NULL
  ) +
  scale_y_continuous(limits = c(-1, 1.5), breaks = seq(-1.5, 1.5, by = 0.5)) +
  # Labels are derived from the breaks, so their number always matches even
  # if the largest gene_end changes.
  scale_x_continuous(expand = c(0, 0),
               limits = c(0, max(plot_combined_data_long$gene_end) + 1e7),
               breaks = seq(from = 0, to = max(plot_combined_data_long$gene_end) + 1e7, by = 10000000),
               labels = scales::label_comma()) +
  scale_color_manual(values = c("8p Proband" = "steelblue", "Trisomy 21" = "firebrick")) +
  expand_limits(x = 0, y = -1.5) +
  theme_classic() +
  coord_cartesian(clip = "off") +
  theme(
    # Set after theme_classic(): a complete theme replaces every element
    # added before it, so the margin has to be applied here to take effect.
    plot.margin = unit(c(1, 1, 2, 1), "lines"),
    text = element_text(family = "Arial", face = "bold"),
    plot.title = element_text(size = 28, face = "bold", hjust = 0.5, family = "Arial"),
    axis.title = element_text(size = 20, family = "Arial"),
    axis.text = element_text(size = 18, family = "Arial"),
    axis.title.x = element_text(margin = margin(t = 20)),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.text = element_text(size = 16, family = "Arial")
  )

print(plot_object)

# ggsave(
#   filename = paste("CHR21_gene_expression_plot_log2_TS21vsDS_PROvsREV.png"),
#   plot = plot_object,
#   width = 18,   # Adjust the width to make the canvas longer
#   height = 6,   # Adjust the height to control the aspect ratio
#   dpi = 300,     # Set the resolution to 300 dpi
#   bg = "transparent"
# )


# Mean of the per-gene DS and TS averages (log2 scale) over the chr21 genes
# in the table, and the fold change implied by the difference of those means.
mean_values_GSE185192 <- gene.deldup.tmm.plot.relative.TS.GSE185192 %>%
  summarize(
    DS_mean_log2 = mean(DS, na.rm = TRUE),
    TS_mean_log2 = mean(TS, na.rm = TRUE),
    abs_fold_change = 2^(TS_mean_log2 - DS_mean_log2)
  )

mean_values_GSE185192
DS_mean_log2 TS_mean_log2 abs_fold_change
3.411458 4.034069 1.539659

filtering out 8p and chr21 genes

# PRO vs REV 8p: protein-coding genes on chromosomes 1-22, X and Y
PRO_REV_8P_table <- contr.rev$top_tags$table %>%
  filter(
    gene_biotype == "protein_coding" &
      gene_chr %in% c(as.character(1:22), "X", "Y")
  )

# TS vs DS 21: same restriction applied to the GSE185192 results
TS_DS_21_table <- contr$top_tags$table %>%
  filter(
    gene_biotype == "protein_coding" &
      gene_chr %in% c(as.character(1:22), "X", "Y")
  )

# write_csv(contr$top_tags$table %>% dplyr::filter(FDR <= 0.05), "DEgenes_Trisomy21.csv")
dim(TS_DS_21_table)
## [1] 14038    15
dim(PRO_REV_8P_table)
## [1] 13212    15
# Per chromosome: number of tested genes, number with FDR <= 0.05, and the
# resulting fraction. Chromosomes are ordered 1-22, X, Y for plotting.
region_counts21 <- TS_DS_21_table %>%
  group_by(region = gene_chr) %>%
  summarise(
    total_genes = n(),
    de_count = sum(FDR <= 0.05),
    .groups = "drop"
  ) %>%
  mutate(
    fraction = de_count / total_genes,
    region = factor(region, levels = c(as.character(1:22), "X", "Y"))
  )

# Bar chart of the DE fraction per chromosome with percentage labels.
ggplot(region_counts21, aes(x = region, y = fraction)) +
  geom_col(width = 0.7) +
  geom_text(
    aes(label = scales::percent(fraction, accuracy = 0.1)),
    vjust = -0.5, size = 3
  ) +
  scale_y_continuous(
    labels = scales::percent,
    expand = expansion(mult = c(0, 0.1))
  ) +
  labs(
    title = "DE Genes per Chromosome",
    subtitle = "TS vs DS (FDR <= 0.05, protein-coding)",
    x = "Chromosome",
    y = "Fraction DE"
  ) +
  theme_classic(base_family = "Arial") +
  theme(
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 14),
    axis.text.x = element_text(hjust = 1),
    panel.grid = element_blank()
  )

## 8p

# Per chromosome: number of tested genes, number with FDR <= 0.05, and the
# resulting fraction. Chromosomes are ordered 1-22, X, Y for plotting.
region_counts8p <- PRO_REV_8P_table %>%
  group_by(region = gene_chr) %>%
  summarise(
    total_genes = n(),
    de_count = sum(FDR <= 0.05),
    .groups = "drop"
  ) %>%
  mutate(
    fraction = de_count / total_genes,
    region = factor(region, levels = c(as.character(1:22), "X", "Y"))
  )

# Bar chart of the DE fraction per chromosome with percentage labels.
ggplot(region_counts8p, aes(x = region, y = fraction)) +
  geom_col(width = 0.7) +
  geom_text(
    aes(label = scales::percent(fraction, accuracy = 0.1)),
    vjust = -0.5, size = 3
  ) +
  scale_y_continuous(
    labels = scales::percent,
    expand = expansion(mult = c(0, 0.1))
  ) +
  labs(
    title = "DE Genes per Chromosome - 8p",
    subtitle = "PRO vs REV (FDR <= 0.05, protein-coding)",
    x = "Chromosome",
    y = "Fraction DE"
  ) +
  theme_classic(base_family = "Arial") +
  theme(
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 14),
    axis.text.x = element_text(hjust = 1),
    panel.grid = element_blank()
  )

# Genes tested in both comparisons. The annotation columns are dropped from
# the TS/DS side so they appear once (from the 8p table); the statistic
# columns shared by both tables get the suffixes _21 and _8p.
merged_FC <- TS_DS_21_table %>%
  dplyr::select(
    -c(
      gene_name, gene_chr, gene_start, gene_end, gene_strand,
      gene_length, gene_description, gene_biotype, tf_family
    )
  ) %>%
  inner_join(PRO_REV_8P_table, by = "gene_id", suffix = c("_21", "_8p"))

# # export as background for enrichr
# write.table(
#   merged_FC$gene_name,
#   file      = "co-detected.txt",
#   quote     = FALSE,
#   row.names = FALSE,
#   col.names = FALSE
# )

Figure 5C

# Spearman
# Rank correlation between the two logFC columns over complete pairs.
spearman_r <- with(
  merged_FC,
  cor(logFC_21, logFC_8p, use = "complete.obs", method = "spearman")
)
spearman_r
## [1] 0.2731122
# Significance test for the Spearman correlation of the two logFC columns.
# `use` is not an argument of cor.test() (it was silently swallowed by `...`);
# cor.test() already restricts itself to complete pairs. exact = FALSE asks
# for the asymptotic p-value directly, which is what the test falls back to
# when ties are present, so the "Cannot compute exact p-value with ties"
# warning is no longer raised.
spearman_test <- cor.test(merged_FC$logFC_21,
                          merged_FC$logFC_8p,
                          method = "spearman",
                          exact  = FALSE)
## Warning in cor.test.default(merged_FC$logFC_21, merged_FC$logFC_8p, use =
## "complete.obs", : Cannot compute exact p-value with ties
spearman_test
## 
##  Spearman's rank correlation rho
## 
## data:  merged_FC$logFC_21 and merged_FC$logFC_8p
## S = 2.7373e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2731122

Figure 5D

# The same two exclusions are applied to each table: drop chromosome 21 genes
# and drop genes whose id is listed in combined_genes$ensembl_gene_id.
total21.filtered <- TS_DS_21_table %>%
  dplyr::filter(
    gene_chr != "21" & !(gene_id %in% combined_genes$ensembl_gene_id)
  )

total8p.filtered <- PRO_REV_8P_table %>%
  dplyr::filter(
    gene_chr != "21" & !(gene_id %in% combined_genes$ensembl_gene_id)
  )

merged_FC_filtered <- merged_FC %>%
  dplyr::filter(
    gene_chr != "21" & !(gene_id %in% combined_genes$ensembl_gene_id)
  )

## protein coding gene
# Gene names significant at FDR <= 0.05, split by the sign of logFC, for each
# of the two filtered comparisons.

# Positive logFC (upregulated)
up_21 <- total21.filtered %>%
  dplyr::filter(FDR <= 0.05, logFC > 0) %>%
  pull(gene_name)
up_8p <- total8p.filtered %>%
  dplyr::filter(FDR <= 0.05, logFC > 0) %>%
  pull(gene_name)

# Negative logFC (downregulated)
down_21 <- total21.filtered %>%
  dplyr::filter(FDR <= 0.05, logFC < 0) %>%
  pull(gene_name)
down_8p <- total8p.filtered %>%
  dplyr::filter(FDR <= 0.05, logFC < 0) %>%
  pull(gene_name)

# Names shared between the two comparisons, per direction.
upregulated_intersected_genes <- intersect(up_21, up_8p)
downregulated_intersected_genes <- intersect(down_21, down_8p)

# Header for the counts printed below (no trailing newline is emitted).
cat("exclude 21 and 8p genes:")
## exclude 21 and 8p genes:
# Number of gene names shared between the two comparisons, per direction.
cat("Upregulated:", length(upregulated_intersected_genes), 
    "\nDownregulated:", length(downregulated_intersected_genes), "\n")
## Upregulated: 263 
## Downregulated: 269
# write.table(
#   upregulated_intersected_genes,
#   file      = "enrichr_upregulated_both_filtered.txt",
#   quote     = FALSE,
#   row.names = FALSE,
#   col.names = FALSE
# )
# 
# write.table(
#   downregulated_intersected_genes,
#   file      = "enrichr_downregulated_both_filtered.txt",
#   quote     = FALSE,
#   row.names = FALSE,
#   col.names = FALSE
# )
# 
# write.table(
#   merged_FC_filtered$gene_name,
#   file      = "co-detected_filtered.txt",
#   quote     = FALSE,
#   row.names = FALSE,
#   col.names = FALSE
# )

# hypergeometric test: is the overlap of upregulated genes between the two
# comparisons larger than expected by chance?

# Universe: all genes in the filtered merged table
N <- nrow(merged_FC_filtered)

# Upregulated in 21 that are in merged_FC_filtered
K <- sum(up_21 %in% merged_FC_filtered$gene_name)

# Upregulated in 8p that are in merged_FC_filtered
n <- sum(up_8p %in% merged_FC_filtered$gene_name)

# Shared upregulated genes that are also in merged_FC_filtered
x <- sum(upregulated_intersected_genes %in% merged_FC_filtered$gene_name)

# Upper tail P(X >= x): with lower.tail = FALSE phyper() returns P(X > q),
# hence q = x - 1. The call was previously issued twice with identical
# arguments; it is computed once here.
p_value <- phyper(q = x - 1, m = K, n = N - K, k = n, lower.tail = FALSE)

cat("N (total tested genes):", N, "\n",
    "K (DE up in dataset 21):", K, "\n",
    "n (DE up in dataset 8p):", n, "\n",
    "x (co-upregulated):", x, "\n")
## N (total tested genes): 12869 
##  K (DE up in dataset 21): 1623 
##  n (DE up in dataset 8p): 972 
##  x (co-upregulated): 263
# "N (total tested genes): 12869 
#  K (DE up in dataset 21): 1623 
#  n (DE up in dataset 8p): 972 
#  x (co-upregulated): 263 
# "
print(p_value)
## [1] 5.901803e-37
# Region sizes for the two-set diagram: 8p-only, 21-only, and shared, derived
# from the N/K/n/x values of the preceding hypergeometric test.
venn_data <- setNames(
  c(n - x, K - x, x),
  c("Invdupdel(8p)", "Trisomy 21", "Invdupdel(8p)&Trisomy 21")
)

venn_euler <- euler(venn_data)

# Area-proportional diagram; labels are suppressed via cex = 0.
p <- plot(
  venn_euler,
  fills = c(
    met.brewer("Hiroshige", n = 10)[3],
    met.brewer("Hiroshige", n = 10)[7]
  ),
  labels = list(font = 1, cex = 0),
  alpha = 0.7
)

# png("euler_plot_up.png", width = 1500, height = 1200, res = 300)
# grid::grid.newpage()
# # Draw the plot
# grid::grid.draw(p)
# 
# # Close the file
# dev.off()


# hypergeometric test for the downregulated overlap
# Universe: all genes in the filtered merged table
N <- nrow(merged_FC_filtered)
# Downregulated in 21 that are in merged_FC_filtered
K <- sum(down_21 %in% merged_FC_filtered$gene_name)
# Downregulated in 8p that are in merged_FC_filtered
n <- sum(down_8p %in% merged_FC_filtered$gene_name)
# Shared downregulated genes that are also in merged_FC_filtered
x <- sum(downregulated_intersected_genes %in% merged_FC_filtered$gene_name)

# Upper tail P(X >= x); arguments are q, m, n, k in that order
p_value <- phyper(x - 1, K, N - K, n, lower.tail = FALSE)

cat(
  "N (total tested genes):", N, "\n",
  "K (DE down in dataset 21):", K, "\n",
  "n (DE down in dataset 8p):", n, "\n",
  "x (co-downregulated):", x, "\n"
)
## N (total tested genes): 12869 
##  K (DE down in dataset 21): 1783 
##  n (DE down in dataset 8p): 1153 
##  x (co-downregulated): 269
# "N (total tested genes): 12869 
#  K (DE down in dataset 21): 1783 
#  n (DE down in dataset 8p): 1153 
#  x (co-downregulated): 269 
# "
print(p_value)
## [1] 5.388227e-20
# Region sizes for the two-set diagram: 8p-only, 21-only, and shared, derived
# from the N/K/n/x values of the preceding hypergeometric test.
venn_data <- setNames(
  c(n - x, K - x, x),
  c("Invdupdel(8p)", "Trisomy 21", "Invdupdel(8p)&Trisomy 21")
)

venn_euler <- euler(venn_data)

# Area-proportional diagram; labels are suppressed via cex = 0.
p <- plot(
  venn_euler,
  fills = c(
    met.brewer("Hiroshige", n = 10)[3],
    met.brewer("Hiroshige", n = 10)[7]
  ),
  labels = list(font = 1, cex = 0),
  alpha = 0.7
)

# png("euler_plot_down.png", width = 1500, height = 1200, res = 300)
# grid::grid.newpage()
# # Draw the plot
# grid::grid.draw(p)
# # Close the file
# dev.off()
# Genes in up_8p but not in up_21, restricted to names found in
# merged_FC_filtered. local() computes the set difference once and keeps the
# intermediate out of the global environment.
upregulated_8p_not_21 <- local({
  only_8p <- setdiff(up_8p, up_21)
  only_8p[only_8p %in% merged_FC_filtered$gene_name]
})

cat(
  "Upregulated in 8p but not in 21:",
  length(upregulated_8p_not_21),
  " (FDR <= 0.05) \n"
)
## Upregulated in 8p but not in 21: 709  (FDR <= 0.05)
# Genes in down_8p but not in down_21, restricted to names found in
# merged_FC_filtered. local() computes the set difference once and keeps the
# intermediate out of the global environment.
downregulated_8p_not_21 <- local({
  only_8p <- setdiff(down_8p, down_21)
  only_8p[only_8p %in% merged_FC_filtered$gene_name]
})

cat(
  "Downregulated in 8p but not in 21:",
  length(downregulated_8p_not_21),
  " (FDR <= 0.05) \n"
)
## Downregulated in 8p but not in 21: 884  (FDR <= 0.05)
# write.table(
#   upregulated_8p_not_21,
#   file      = "enrichr_upregulated_8p_not_21_filtered.txt",
#   quote     = FALSE,
#   row.names = FALSE,
#   col.names = FALSE
# )
# 
# write.table(
#   downregulated_8p_not_21,
#   file      = "enrichr_downregulated_8p_not_21_filtered.txt",
#   quote     = FALSE,
#   row.names = FALSE,
#   col.names = FALSE
# )


# Genes in up_21 but not in up_8p, restricted to names found in
# merged_FC_filtered. local() computes the set difference once and keeps the
# intermediate out of the global environment.
upregulated_21_not_8p <- local({
  only_21 <- setdiff(up_21, up_8p)
  only_21[only_21 %in% merged_FC_filtered$gene_name]
})
cat(
  "Upregulated in 21 but not in 8p:",
  length(upregulated_21_not_8p),
  " (FDR <= 0.05) \n"
)
## Upregulated in 21 but not in 8p: 1360  (FDR <= 0.05)
# Genes in down_21 but not in down_8p, restricted to names found in
# merged_FC_filtered. local() computes the set difference once and keeps the
# intermediate out of the global environment.
downregulated_21_not_8p <- local({
  only_21 <- setdiff(down_21, down_8p)
  only_21[only_21 %in% merged_FC_filtered$gene_name]
})
cat(
  "Downregulated in 21 but not in 8p:",
  length(downregulated_21_not_8p),
  " (FDR <= 0.05) \n"
)
## Downregulated in 21 but not in 8p: 1514  (FDR <= 0.05)
# write.table(
#   upregulated_21_not_8p,
#   file      = "enrichr_upregulated_21_not_8p_filtered.txt",
#   quote     = FALSE,
#   row.names = FALSE,
#   col.names = FALSE
# )
# 
# write.table(
#   downregulated_21_not_8p,
#   file      = "enrichr_downregulated_21_not_8p_filtered.txt",
#   quote     = FALSE,
#   row.names = FALSE,
#   col.names = FALSE
# )
# Ensembl gene IDs that occur in both the deletion-region and the
# duplication-region gene tables; report whether any were found.
overlapping_genes <- intersect(
  protein_coding_genes_del$ensembl_gene_id,
  protein_coding_genes_dup$ensembl_gene_id
)
if (length(overlapping_genes) == 0) {
  cat("No genes are present in both the deletion and duplication regions.\n")
} else {
  cat("Genes present in both regions:\n")
  print(overlapping_genes)
}
## No genes are present in both the deletion and duplication regions.

Session Info and Citations

# Print the R version, platform, and attached/loaded package versions.
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_5.1.0         eulerr_7.0.2           MetBrewer_0.2.0       
##  [4] gt_0.11.1              gtsummary_2.0.4        ggrepel_0.9.6         
##  [7] scales_1.3.0           biomaRt_2.60.1         msigdbr_7.5.1         
## [10] enrichplot_1.24.4      org.Hs.eg.db_3.19.1    AnnotationDbi_1.66.0  
## [13] IRanges_2.38.1         S4Vectors_0.42.1       Biobase_2.64.0        
## [16] BiocGenerics_0.50.0    clusterProfiler_4.12.6 edgeR_4.2.2           
## [19] limma_3.60.6           lubridate_1.9.3        forcats_1.0.0         
## [22] stringr_1.5.1          dplyr_1.1.4            purrr_1.0.2           
## [25] readr_2.1.5            tidyr_1.3.1            tibble_3.2.1          
## [28] ggplot2_3.5.1          tidyverse_2.0.0        readxl_1.4.3          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.17.1       jsonlite_1.8.9         
##   [4] magrittr_2.0.3          farver_2.1.2            rmarkdown_2.28         
##   [7] fs_1.6.5                zlibbioc_1.50.0         vctrs_0.6.5            
##  [10] memoise_2.0.1           ggtree_3.12.0           progress_1.2.3         
##  [13] htmltools_0.5.8.1       curl_5.2.3              cellranger_1.1.0       
##  [16] gridGraphics_0.5-1      sass_0.4.9              bslib_0.8.0            
##  [19] plyr_1.8.9              httr2_1.0.5             cachem_1.1.0           
##  [22] igraph_2.1.1            lifecycle_1.0.4         pkgconfig_2.0.3        
##  [25] Matrix_1.7-1            R6_2.5.1                fastmap_1.2.0          
##  [28] gson_0.1.0              GenomeInfoDbData_1.2.12 digest_0.6.37          
##  [31] aplot_0.2.3             colorspace_2.1-1        patchwork_1.3.0        
##  [34] RSQLite_2.3.7           labeling_0.4.3          filelock_1.0.3         
##  [37] fansi_1.0.6             timechange_0.3.0        httr_1.4.7             
##  [40] polyclip_1.10-7         compiler_4.4.1          bit64_4.5.2            
##  [43] withr_3.0.2             BiocParallel_1.38.0     viridis_0.6.5          
##  [46] DBI_1.2.3               highr_0.11              ggforce_0.4.2          
##  [49] R.utils_2.12.3          MASS_7.3-61             rappdirs_0.3.3         
##  [52] tools_4.4.1             ape_5.8                 scatterpie_0.2.4       
##  [55] R.oo_1.26.0             glue_1.8.0              nlme_3.1-166           
##  [58] GOSemSim_2.30.2         polylabelr_0.3.0        shadowtext_0.1.4       
##  [61] reshape2_1.4.4          fgsea_1.30.0            generics_0.1.3         
##  [64] gtable_0.3.6            tzdb_0.4.0              R.methodsS3_1.8.2      
##  [67] data.table_1.16.2       hms_1.1.3               xml2_1.3.6             
##  [70] tidygraph_1.3.1         utf8_1.2.4              XVector_0.44.0         
##  [73] pillar_1.9.0            vroom_1.6.5             babelgene_22.9         
##  [76] yulab.utils_0.1.7       splines_4.4.1           tweenr_2.0.3           
##  [79] BiocFileCache_2.12.0    treeio_1.28.0           lattice_0.22-6         
##  [82] bit_4.5.0               tidyselect_1.2.1        GO.db_3.19.1           
##  [85] locfit_1.5-9.10         Biostrings_2.72.1       knitr_1.48             
##  [88] gridExtra_2.3           xfun_0.48               graphlayouts_1.2.0     
##  [91] statmod_1.5.0           stringi_1.8.4           UCSC.utils_1.0.0       
##  [94] lazyeval_0.2.2          ggfun_0.1.7             yaml_2.3.10            
##  [97] evaluate_1.0.1          codetools_0.2-20        ggraph_2.2.1           
## [100] qvalue_2.36.0           ggplotify_0.1.2         cli_3.6.3              
## [103] munsell_0.5.1           jquerylib_0.1.4         Rcpp_1.0.13            
## [106] GenomeInfoDb_1.40.1     dbplyr_2.5.0            png_0.1-8              
## [109] parallel_4.4.1          blob_1.2.4              prettyunits_1.2.0      
## [112] DOSE_3.30.5             viridisLite_0.4.2       tidytree_0.4.6         
## [115] crayon_1.5.3            rlang_1.1.4             cowplot_1.1.3          
## [118] fastmatch_1.1-4         KEGGREST_1.44.1
# Look up the citation entry for every attached non-base package
# (the "otherPkgs" component of sessionInfo()) and print them all.
packages_in_use <- names(sessionInfo()$otherPkgs)
the_citations_list <- lapply(packages_in_use, citation)
the_citations_list
## [[1]]
## To cite package 'ggthemes' in publications use:
## 
##   Arnold J (2024). _ggthemes: Extra Themes, Scales and Geoms for
##   'ggplot2'_. R package version 5.1.0,
##   <https://CRAN.R-project.org/package=ggthemes>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {ggthemes: Extra Themes, Scales and Geoms for 'ggplot2'},
##     author = {Jeffrey B. Arnold},
##     year = {2024},
##     note = {R package version 5.1.0},
##     url = {https://CRAN.R-project.org/package=ggthemes},
##   }
## 
## [[2]]
## To cite use of the eulerr R package in publications, please use:
## 
##   Larsson J (2024). _eulerr: Area-Proportional Euler and Venn Diagrams
##   with Ellipses_. R package version 7.0.2,
##   <https://CRAN.R-project.org/package=eulerr>.
## 
## To cite the methodology behind eulerr in publications, please use:
## 
##   Larsson J, Gustafsson P (2018). "A Case Study in Fitting
##   Area-Proportional Euler Diagrams with Ellipses Using eulerr." In
##   _Proceedings of International Workshop on Set Visualization and
##   Reasoning_, volume 2116, 84-91.
##   <https://cran.r-project.org/package=eulerr>.
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
## 
## [[3]]
## To cite package 'MetBrewer' in publications use:
## 
##   Mills BR (2022). _MetBrewer: Color Palettes Inspired by Works at the
##   Metropolitan Museum of Art_. R package version 0.2.0,
##   <https://CRAN.R-project.org/package=MetBrewer>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {MetBrewer: Color Palettes Inspired by Works at the Metropolitan Museum of
## Art},
##     author = {Blake Robert Mills},
##     year = {2022},
##     note = {R package version 0.2.0},
##     url = {https://CRAN.R-project.org/package=MetBrewer},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
## 
## [[4]]
## To cite package 'gt' in publications use:
## 
##   Iannone R, Cheng J, Schloerke B, Hughes E, Lauer A, Seo J, Brevoort
##   K, Roy O (2024). _gt: Easily Create Presentation-Ready Display
##   Tables_. R package version 0.11.1,
##   <https://CRAN.R-project.org/package=gt>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {gt: Easily Create Presentation-Ready Display Tables},
##     author = {Richard Iannone and Joe Cheng and Barret Schloerke and Ellis Hughes and Alexandra Lauer and JooYoung Seo and Ken Brevoort and Olivier Roy},
##     year = {2024},
##     note = {R package version 0.11.1},
##     url = {https://CRAN.R-project.org/package=gt},
##   }
## 
## [[5]]
## To cite gtsummary in publications use:
## 
##   Sjoberg DD, Whiting K, Curry M, Lavery JA, Larmarange J. Reproducible
##   summary tables with the gtsummary package. The R Journal
##   2021;13:570–80. https://doi.org/10.32614/RJ-2021-053.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{gtsummary,
##     author = {Daniel D. Sjoberg and Karissa Whiting and Michael Curry and Jessica A. Lavery and Joseph Larmarange},
##     title = {Reproducible Summary Tables with the gtsummary Package},
##     journal = {{The R Journal}},
##     year = {2021},
##     url = {https://doi.org/10.32614/RJ-2021-053},
##     doi = {10.32614/RJ-2021-053},
##     volume = {13},
##     issue = {1},
##     pages = {570-580},
##   }
## 
## [[6]]
## To cite package 'ggrepel' in publications use:
## 
##   Slowikowski K (2024). _ggrepel: Automatically Position
##   Non-Overlapping Text Labels with 'ggplot2'_. R package version 0.9.6,
##   <https://CRAN.R-project.org/package=ggrepel>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {ggrepel: Automatically Position Non-Overlapping Text Labels with
## 'ggplot2'},
##     author = {Kamil Slowikowski},
##     year = {2024},
##     note = {R package version 0.9.6},
##     url = {https://CRAN.R-project.org/package=ggrepel},
##   }
## 
## [[7]]
## To cite package 'scales' in publications use:
## 
##   Wickham H, Pedersen T, Seidel D (2023). _scales: Scale Functions for
##   Visualization_. R package version 1.3.0,
##   <https://CRAN.R-project.org/package=scales>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {scales: Scale Functions for Visualization},
##     author = {Hadley Wickham and Thomas Lin Pedersen and Dana Seidel},
##     year = {2023},
##     note = {R package version 1.3.0},
##     url = {https://CRAN.R-project.org/package=scales},
##   }
## 
## [[8]]
## To cite the biomaRt package in publications use:
## 
##   Mapping identifiers for the integration of genomic datasets with the
##   R/Bioconductor package biomaRt. Steffen Durinck, Paul T. Spellman,
##   Ewan Birney and Wolfgang Huber, Nature Protocols 4, 1184-1191 (2009).
## 
##   BioMart and Bioconductor: a powerful link between biological
##   databases and microarray data analysis. Steffen Durinck, Yves Moreau,
##   Arek Kasprzyk, Sean Davis, Bart De Moor, Alvis Brazma and Wolfgang
##   Huber, Bioinformatics 21, 3439-3440 (2005).
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
## 
## [[9]]
## To cite package 'msigdbr' in publications use:
## 
##   Dolgalev I (2022). _msigdbr: MSigDB Gene Sets for Multiple Organisms
##   in a Tidy Data Format_. R package version 7.5.1,
##   <https://CRAN.R-project.org/package=msigdbr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {msigdbr: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format},
##     author = {Igor Dolgalev},
##     year = {2022},
##     note = {R package version 7.5.1},
##     url = {https://CRAN.R-project.org/package=msigdbr},
##   }
## 
## [[10]]
## To cite package 'enrichplot' in publications use:
## 
##   Yu G (2024). _enrichplot: Visualization of Functional Enrichment
##   Result_. doi:10.18129/B9.bioc.enrichplot
##   <https://doi.org/10.18129/B9.bioc.enrichplot>, R package version
##   1.24.4, <https://bioconductor.org/packages/enrichplot>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {enrichplot: Visualization of Functional Enrichment Result},
##     author = {Guangchuang Yu},
##     year = {2024},
##     note = {R package version 1.24.4},
##     url = {https://bioconductor.org/packages/enrichplot},
##     doi = {10.18129/B9.bioc.enrichplot},
##   }
## 
## [[11]]
## To cite package 'org.Hs.eg.db' in publications use:
## 
##   Carlson M (2024). _org.Hs.eg.db: Genome wide annotation for Human_. R
##   package version 3.19.1.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {org.Hs.eg.db: Genome wide annotation for Human},
##     author = {Marc Carlson},
##     year = {2024},
##     note = {R package version 3.19.1},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
## 
## [[12]]
## To cite package 'AnnotationDbi' in publications use:
## 
##   Pagès H, Carlson M, Falcon S, Li N (2024). _AnnotationDbi:
##   Manipulation of SQLite-based annotations in Bioconductor_.
##   doi:10.18129/B9.bioc.AnnotationDbi
##   <https://doi.org/10.18129/B9.bioc.AnnotationDbi>, R package version
##   1.66.0, <https://bioconductor.org/packages/AnnotationDbi>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {AnnotationDbi: Manipulation of SQLite-based annotations in Bioconductor},
##     author = {Hervé Pagès and Marc Carlson and Seth Falcon and Nianhua Li},
##     year = {2024},
##     note = {R package version 1.66.0},
##     url = {https://bioconductor.org/packages/AnnotationDbi},
##     doi = {10.18129/B9.bioc.AnnotationDbi},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
## 
## [[13]]
## To cite package 'IRanges' in publications use:
## 
##   Lawrence M, Huber W, Pag\`es H, Aboyoun P, Carlson M, et al. (2013)
##   Software for Computing and Annotating Genomic Ranges. PLoS Comput
##   Biol 9(8): e1003118. doi:10.1371/journal.pcbi.1003118
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Software for Computing and Annotating Genomic Ranges},
##     author = {Michael Lawrence and Wolfgang Huber and Herv\'e Pag\`es and Patrick Aboyoun and Marc Carlson and Robert Gentleman and Martin Morgan and Vincent Carey},
##     year = {2013},
##     journal = {{PLoS} Computational Biology},
##     volume = {9},
##     issue = {8},
##     doi = {10.1371/journal.pcbi.1003118},
##     url = {http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118},
##   }
## 
## [[14]]
## To cite package 'S4Vectors' in publications use:
## 
##   Pagès H, Lawrence M, Aboyoun P (2024). _S4Vectors: Foundation of
##   vector-like and list-like containers in Bioconductor_.
##   doi:10.18129/B9.bioc.S4Vectors
##   <https://doi.org/10.18129/B9.bioc.S4Vectors>, R package version
##   0.42.1, <https://bioconductor.org/packages/S4Vectors>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {S4Vectors: Foundation of vector-like and list-like containers in
## Bioconductor},
##     author = {Hervé Pagès and Michael Lawrence and Patrick Aboyoun},
##     year = {2024},
##     note = {R package version 0.42.1},
##     url = {https://bioconductor.org/packages/S4Vectors},
##     doi = {10.18129/B9.bioc.S4Vectors},
##   }
## 
## [[15]]
## To cite package 'Biobase' in publications use:
## 
##   Orchestrating high-throughput genomic analysis with Bioconductor. W.
##   Huber, V.J. Carey, R. Gentleman, ..., M. Morgan Nature Methods,
##   2015:12, 115.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     author = {W. Huber and V. J. Carey and R. Gentleman and S. Anders and M. Carlson and B. S. Carvalho and H. C. Bravo and S. Davis and L. Gatto and T. Girke and R. Gottardo and F. Hahne and K. D. Hansen and R. A. Irizarry and M. Lawrence and M. I. Love and J. MacDonald and V. Obenchain and A. K. {Ole's} and H. {Pag`es} and A. Reyes and P. Shannon and G. K. Smyth and D. Tenenbaum and L. Waldron and M. Morgan},
##     title = {{O}rchestrating high-throughput genomic analysis with {B}ioconductor},
##     journal = {Nature Methods},
##     year = {2015},
##     volume = {12},
##     number = {2},
##     pages = {115--121},
##     url = {http://www.nature.com/nmeth/journal/v12/n2/full/nmeth.3252.html},
##   }
## 
## [[16]]
## To cite package 'BiocGenerics' in publications use:
## 
##   Orchestrating high-throughput genomic analysis with Bioconductor. W.
##   Huber, V.J. Carey, R. Gentleman, ..., M. Morgan Nature Methods,
##   2015:12, 115.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     author = {{Huber} and {W.} and {Carey} and V. J. and {Gentleman} and {R.} and {Anders} and {S.} and {Carlson} and {M.} and {Carvalho} and B. S. and {Bravo} and H. C. and {Davis} and {S.} and {Gatto} and {L.} and {Girke} and {T.} and {Gottardo} and {R.} and {Hahne} and {F.} and {Hansen} and K. D. and {Irizarry} and R. A. and {Lawrence} and {M.} and {Love} and M. I. and {MacDonald} and {J.} and {Obenchain} and {V.} and {{Ole's}} and A. K. and {{Pag`es}} and {H.} and {Reyes} and {A.} and {Shannon} and {P.} and {Smyth} and G. K. and {Tenenbaum} and {D.} and {Waldron} and {L.} and {Morgan} and {M.}},
##     title = {{O}rchestrating high-throughput genomic analysis with {B}ioconductor},
##     journal = {Nature Methods},
##     year = {2015},
##     volume = {12},
##     number = {2},
##     pages = {115--121},
##     url = {http://www.nature.com/nmeth/journal/v12/n2/full/nmeth.3252.html},
##   }
## 
## [[17]]
## Please cite S. Xu (2024) for using clusterProfiler. In addition, please
## cite G. Yu (2010) when using GOSemSim, G. Yu (2015) when using DOSE and
## G. Yu (2015) when using ChIPseeker.
## 
##   S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R
##   Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
##   multiomics data. Nature Protocols. 2024,
##   doi:10.1038/s41596-024-01020-z
## 
##   T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L
##   Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
##   enrichment tool for interpreting omics data. The Innovation. 2021,
##   2(3):100141
## 
##   Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
##   clusterProfiler: an R package for comparing biological themes among
##   gene clusters. OMICS: A Journal of Integrative Biology 2012,
##   16(5):284-287
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
## 
## [[18]]
## See Section 1.2 in the User's Guide for more detail about how to cite
## the different edgeR pipelines.
## 
##   Chen Y, Chen L, Lun ATL, Baldoni PL, Smyth GK (2024). edgeR 4.0:
##   powerful differential analysis of sequencing data with expanded
##   functionality and improved support for small counts and larger
##   datasets. bioRxiv doi: 10.1101/2024.01.21.576131
## 
##   Chen Y, Lun ATL, Smyth GK (2016). From reads to genes to pathways:
##   differential expression analysis of RNA-Seq experiments using
##   Rsubread and the edgeR quasi-likelihood pipeline. F1000Research 5,
##   1438
## 
##   McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression
##   analysis of multifactor RNA-Seq experiments with respect to
##   biological variation. Nucleic Acids Research 40(10), 4288-4297
## 
##   Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
##   package for differential expression analysis of digital gene
##   expression data. Bioinformatics 26(1), 139-140
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
## 
## [[19]]
## To cite package 'limma' in publications use:
## 
##   Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and
##   Smyth, G.K. (2015). limma powers differential expression analyses for
##   RNA-sequencing and microarray studies. Nucleic Acids Research 43(7),
##   e47.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     author = {Matthew E Ritchie and Belinda Phipson and Di Wu and Yifang Hu and Charity W Law and Wei Shi and Gordon K Smyth},
##     title = {{limma} powers differential expression analyses for {RNA}-sequencing and microarray studies},
##     journal = {Nucleic Acids Research},
##     year = {2015},
##     volume = {43},
##     number = {7},
##     pages = {e47},
##     doi = {10.1093/nar/gkv007},
##   }
## 
## [[20]]
## To cite lubridate in publications use:
## 
##   Garrett Grolemund, Hadley Wickham (2011). Dates and Times Made Easy
##   with lubridate. Journal of Statistical Software, 40(3), 1-25. URL
##   https://www.jstatsoft.org/v40/i03/.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Dates and Times Made Easy with {lubridate}},
##     author = {Garrett Grolemund and Hadley Wickham},
##     journal = {Journal of Statistical Software},
##     year = {2011},
##     volume = {40},
##     number = {3},
##     pages = {1--25},
##     url = {https://www.jstatsoft.org/v40/i03/},
##   }
## 
## [[21]]
## To cite package 'forcats' in publications use:
## 
##   Wickham H (2023). _forcats: Tools for Working with Categorical
##   Variables (Factors)_. R package version 1.0.0,
##   <https://CRAN.R-project.org/package=forcats>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {forcats: Tools for Working with Categorical Variables (Factors)},
##     author = {Hadley Wickham},
##     year = {2023},
##     note = {R package version 1.0.0},
##     url = {https://CRAN.R-project.org/package=forcats},
##   }
## 
## [[22]]
## To cite package 'stringr' in publications use:
## 
##   Wickham H (2023). _stringr: Simple, Consistent Wrappers for Common
##   String Operations_. R package version 1.5.1,
##   <https://CRAN.R-project.org/package=stringr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {stringr: Simple, Consistent Wrappers for Common String Operations},
##     author = {Hadley Wickham},
##     year = {2023},
##     note = {R package version 1.5.1},
##     url = {https://CRAN.R-project.org/package=stringr},
##   }
## 
## [[23]]
## To cite package 'dplyr' in publications use:
## 
##   Wickham H, François R, Henry L, Müller K, Vaughan D (2023). _dplyr: A
##   Grammar of Data Manipulation_. R package version 1.1.4,
##   <https://CRAN.R-project.org/package=dplyr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {dplyr: A Grammar of Data Manipulation},
##     author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan},
##     year = {2023},
##     note = {R package version 1.1.4},
##     url = {https://CRAN.R-project.org/package=dplyr},
##   }
## 
## [[24]]
## To cite package 'purrr' in publications use:
## 
##   Wickham H, Henry L (2023). _purrr: Functional Programming Tools_. R
##   package version 1.0.2, <https://CRAN.R-project.org/package=purrr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {purrr: Functional Programming Tools},
##     author = {Hadley Wickham and Lionel Henry},
##     year = {2023},
##     note = {R package version 1.0.2},
##     url = {https://CRAN.R-project.org/package=purrr},
##   }
## 
## [[25]]
## To cite package 'readr' in publications use:
## 
##   Wickham H, Hester J, Bryan J (2024). _readr: Read Rectangular Text
##   Data_. R package version 2.1.5,
##   <https://CRAN.R-project.org/package=readr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {readr: Read Rectangular Text Data},
##     author = {Hadley Wickham and Jim Hester and Jennifer Bryan},
##     year = {2024},
##     note = {R package version 2.1.5},
##     url = {https://CRAN.R-project.org/package=readr},
##   }
## 
## [[26]]
## To cite package 'tidyr' in publications use:
## 
##   Wickham H, Vaughan D, Girlich M (2024). _tidyr: Tidy Messy Data_. R
##   package version 1.3.1, <https://CRAN.R-project.org/package=tidyr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {tidyr: Tidy Messy Data},
##     author = {Hadley Wickham and Davis Vaughan and Maximilian Girlich},
##     year = {2024},
##     note = {R package version 1.3.1},
##     url = {https://CRAN.R-project.org/package=tidyr},
##   }
## 
## [[27]]
## To cite package 'tibble' in publications use:
## 
##   Müller K, Wickham H (2023). _tibble: Simple Data Frames_. R package
##   version 3.2.1, <https://CRAN.R-project.org/package=tibble>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {tibble: Simple Data Frames},
##     author = {Kirill Müller and Hadley Wickham},
##     year = {2023},
##     note = {R package version 3.2.1},
##     url = {https://CRAN.R-project.org/package=tibble},
##   }
## 
## [[28]]
## To cite ggplot2 in publications, please use
## 
##   H. Wickham. ggplot2: Elegant Graphics for Data Analysis.
##   Springer-Verlag New York, 2016.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     author = {Hadley Wickham},
##     title = {ggplot2: Elegant Graphics for Data Analysis},
##     publisher = {Springer-Verlag New York},
##     year = {2016},
##     isbn = {978-3-319-24277-4},
##     url = {https://ggplot2.tidyverse.org},
##   }
## 
## [[29]]
## To cite package 'tidyverse' in publications use:
## 
##   Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R,
##   Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller
##   E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V,
##   Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to
##   the tidyverse." _Journal of Open Source Software_, *4*(43), 1686.
##   doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Welcome to the {tidyverse}},
##     author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
##     year = {2019},
##     journal = {Journal of Open Source Software},
##     volume = {4},
##     number = {43},
##     pages = {1686},
##     doi = {10.21105/joss.01686},
##   }
## 
## [[30]]
## To cite package 'readxl' in publications use:
## 
##   Wickham H, Bryan J (2023). _readxl: Read Excel Files_. R package
##   version 1.4.3, <https://CRAN.R-project.org/package=readxl>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {readxl: Read Excel Files},
##     author = {Hadley Wickham and Jennifer Bryan},
##     year = {2023},
##     note = {R package version 1.4.3},
##     url = {https://CRAN.R-project.org/package=readxl},
##   }