Load Libraries

library("readxl")
library("tidyverse") 
library("edgeR")        
library("clusterProfiler")
library("org.Hs.eg.db") 
library("enrichplot")   
library("msigdbr")
library("biomaRt")
library(MetBrewer)

Analysis on Altered Region

Ensembl GRCh38

# Connect to the Ensembl "genes" BioMart for human.
# NOTE(review): no `version` argument is passed, so presumably whichever
# Ensembl release is current at run time is queried; the gene counts recorded
# below may therefore change between runs -- confirm and pin if needed.
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
# region of interest
# Coordinates (bp) of the deleted and duplicated intervals on chromosome 8.
chromosome_of_interest <- "8"
deletion_start <- 0
deletion_end <- 7247573
duplication_start <- 11828865
duplication_end <- 40361770

# Gene-level columns requested from BioMart for every query below.
attributes <- c(
  "ensembl_gene_id",
  "external_gene_name",
  "chromosome_name",
  "start_position",
  "end_position",
  "strand",
  "gene_biotype"
)

# deletion region
# Filters and values are matched by position: chromosome, start, end.
# NOTE(review): the start/end filters appear to return genes that overlap the
# interval rather than genes fully contained in it (in the recorded output,
# FDFT1 starts before duplication_start yet is listed) -- confirm against the
# BioMart filter documentation.
filters_del <- c("chromosome_name", "start", "end")
values_del <- list(chromosome_of_interest, deletion_start, deletion_end)

genes_in_deletion <- getBM(
  attributes = attributes,
  filters = filters_del,
  values = values_del,
  mart = ensembl
)

# Keep protein-coding genes only and count distinct Ensembl gene IDs.
protein_coding_genes_del <- genes_in_deletion[genes_in_deletion$gene_biotype == "protein_coding", ]
number_of_genes_del <- length(unique(protein_coding_genes_del$ensembl_gene_id))
cat("Number of protein-coding genes in the deletion region:", number_of_genes_del, "\n")
## Number of protein-coding genes in the deletion region: 23
# duplication region
# Same query as above, applied to the duplicated interval.
filters_dup <- c("chromosome_name", "start", "end")
values_dup <- list(chromosome_of_interest, duplication_start, duplication_end)

genes_in_duplication <- getBM(
  attributes = attributes,
  filters = filters_dup,
  values = values_dup,
  mart = ensembl
)

protein_coding_genes_dup <- genes_in_duplication[genes_in_duplication$gene_biotype == "protein_coding", ]
number_of_genes_dup <- length(unique(protein_coding_genes_dup$ensembl_gene_id))
cat("Number of protein-coding genes in the duplication region:", number_of_genes_dup, "\n")
## Number of protein-coding genes in the duplication region: 172
# Tag each table with its region of origin, then stack them into one table.
# The row names printed below are carried over from the two query results
# (rbind makes clashing ones unique), so they are not consecutive.
protein_coding_genes_del$region <- "Deletion"
protein_coding_genes_dup$region <- "Duplication"
combined_genes <- rbind(protein_coding_genes_del, protein_coding_genes_dup)
combined_genes
ensembl_gene_id external_gene_name chromosome_name start_position end_position strand gene_biotype region
6 ENSG00000176269 OR4F21 8 166086 167024 -1 protein_coding Deletion
10 ENSG00000172748 ZNF596 8 232137 264703 1 protein_coding Deletion
21 ENSG00000147364 FBXO25 8 406428 477967 1 protein_coding Deletion
24 ENSG00000180190 TDRP 8 489803 545781 -1 protein_coding Deletion
26 ENSG00000104714 ERICH1 8 614746 738106 -1 protein_coding Deletion
31 ENSG00000198010 DLGAP2 8 737628 1708476 1 protein_coding Deletion
48 ENSG00000182372 CLN8 8 1755778 1801711 1 protein_coding Deletion
50 ENSG00000283239 KBTBD11-OT1 8 1763888 1958627 1 protein_coding Deletion
54 ENSG00000104728 ARHGEF10 8 1823926 1958641 1 protein_coding Deletion
60 ENSG00000176595 KBTBD11 8 1973677 2006936 1 protein_coding Deletion
64 ENSG00000036448 MYOM2 8 2045046 2165552 1 protein_coding Deletion
85 ENSG00000183117 CSMD1 8 2935353 4994972 -1 protein_coding Deletion
121 ENSG00000147316 MCPH1 8 6406592 6648508 1 protein_coding Deletion
123 ENSG00000091879 ANGPT2 8 6499632 6563409 -1 protein_coding Deletion
130 ENSG00000155189 AGPAT5 8 6708642 6761503 1 protein_coding Deletion
137 ENSG00000275591 XKR5 8 6808517 6835524 -1 protein_coding Deletion
141 ENSG00000164825 DEFB1 8 6870592 6877936 -1 protein_coding Deletion
144 ENSG00000164822 DEFA6 8 6924697 6926076 -1 protein_coding Deletion
147 ENSG00000164821 DEFA4 8 6935820 6938306 -1 protein_coding Deletion
152 ENSG00000206047 DEFA1 8 6977649 6980092 -1 protein_coding Deletion
155 ENSG00000240247 DEFA1B 8 6996766 6999198 -1 protein_coding Deletion
158 ENSG00000239839 DEFA3 8 7015869 7018297 -1 protein_coding Deletion
161 ENSG00000164816 DEFA5 8 7055304 7056739 -1 protein_coding Deletion
1 ENSG00000079459 FDFT1 8 11795573 11839395 1 protein_coding Duplication
2 ENSG00000164733 CTSB 8 11842524 11869533 -1 protein_coding Duplication
101 ENSG00000205884 DEFB136 8 11973937 11974599 -1 protein_coding Duplication
11 ENSG00000205883 DEFB135 8 11982256 11984590 1 protein_coding Duplication
12 ENSG00000205882 DEFB134 8 11993174 12000752 -1 protein_coding Duplication
20 ENSG00000233050 DEFB130B 8 12064389 12071747 -1 protein_coding Duplication
22 ENSG00000215343 ZNF705D 8 12089338 12115516 1 protein_coding Duplication
262 ENSG00000226430 USP17L7 8 12132417 12134099 -1 protein_coding Duplication
27 ENSG00000223443 USP17L2 8 12136435 12138849 -1 protein_coding Duplication
28 ENSG00000254866 DEFB109D 8 12150888 12158033 -1 protein_coding Duplication
311 ENSG00000186523 FAM86B1 8 12182096 12194133 -1 protein_coding Duplication
35 ENSG00000232948 DEFB130A 8 12310962 12318316 -1 protein_coding Duplication
47 ENSG00000145002 FAM86B2 8 12424411 12436406 -1 protein_coding Duplication
65 ENSG00000154359 LONRF1 8 12721906 12756073 -1 protein_coding Duplication
73 ENSG00000250305 TRMT9B 8 12945642 13031503 1 protein_coding Duplication
78 ENSG00000164741 DLC1 8 13083361 13604610 -1 protein_coding Duplication
84 ENSG00000164743 C8orf48 8 13566869 13568288 1 protein_coding Duplication
95 ENSG00000185053 SGCZ 8 14084845 15238431 -1 protein_coding Duplication
110 ENSG00000104723 TUSC3 8 15417215 15766649 1 protein_coding Duplication
117 ENSG00000038945 MSR1 8 16107878 16567490 -1 protein_coding Duplication
125 ENSG00000078579 FGF20 8 16992181 17002345 -1 protein_coding Duplication
127 ENSG00000155970 MICU3 8 17027238 17125880 1 protein_coding Duplication
131 ENSG00000104219 ZDHHC2 8 17156482 17224799 1 protein_coding Duplication
132 ENSG00000198791 CNOT7 8 17224966 17246878 -1 protein_coding Duplication
134 ENSG00000155975 VPS37A 8 17246931 17302427 1 protein_coding Duplication
136 ENSG00000003987 MTMR7 8 17296794 17413528 -1 protein_coding Duplication
1411 ENSG00000003989 SLC7A2 8 17497088 17570573 1 protein_coding Duplication
146 ENSG00000104213 PDGFRL 8 17576433 17644071 1 protein_coding Duplication
149 ENSG00000129422 MTUS1 8 17643795 17801094 -1 protein_coding Duplication
156 ENSG00000104760 FGL1 8 17864380 17910365 -1 protein_coding Duplication
159 ENSG00000078674 PCM1 8 17922842 18029948 1 protein_coding Duplication
1611 ENSG00000104763 ASAH1 8 18055992 18084998 -1 protein_coding Duplication
164 ENSG00000171428 NAT1 8 18170477 18223689 1 protein_coding Duplication
171 ENSG00000156006 NAT2 8 18391282 18401218 1 protein_coding Duplication
172 ENSG00000156011 PSD3 8 18527303 19084730 -1 protein_coding Duplication
190 ENSG00000104611 SH2D4A 8 19313693 19396218 1 protein_coding Duplication
192 ENSG00000147408 CSGALNACT1 8 19404161 19758029 -1 protein_coding Duplication
198 ENSG00000104613 INTS10 8 19817391 19852083 1 protein_coding Duplication
199 ENSG00000175445 LPL 8 19901717 19967259 1 protein_coding Duplication
204 ENSG00000036565 SLC18A1 8 20144855 20183206 -1 protein_coding Duplication
206 ENSG00000147416 ATP6V1B2 8 20197381 20230399 1 protein_coding Duplication
209 ENSG00000061337 LZTS1 8 20246165 20303963 -1 protein_coding Duplication
235 ENSG00000168546 GFRA2 8 21690398 21812357 -1 protein_coding Duplication
237 ENSG00000147443 DOK2 8 21908873 21913690 -1 protein_coding Duplication
238 ENSG00000130227 XPO7 8 21919662 22006585 1 protein_coding Duplication
241 ENSG00000158806 NPM2 8 22024125 22036897 1 protein_coding Duplication
242 ENSG00000158815 FGF17 8 22042398 22048809 1 protein_coding Duplication
243 ENSG00000158856 DMTN 8 22048995 22082527 1 protein_coding Duplication
245 ENSG00000158863 FHIP2B 8 22089150 22104911 1 protein_coding Duplication
246 ENSG00000275074 NUDT18 8 22105748 22109419 -1 protein_coding Duplication
247 ENSG00000168453 HR 8 22114419 22133384 -1 protein_coding Duplication
248 ENSG00000288677 HRURF 8 22130458 22131010 -1 protein_coding Duplication
249 ENSG00000168476 REEP4 8 22138020 22141951 -1 protein_coding Duplication
251 ENSG00000168481 LGI3 8 22146830 22157084 -1 protein_coding Duplication
252 ENSG00000168484 SFTPC 8 22156913 22164479 1 protein_coding Duplication
253 ENSG00000168487 BMP1 8 22165140 22212326 1 protein_coding Duplication
256 ENSG00000168490 PHYHIP 8 22219703 22232101 -1 protein_coding Duplication
259 ENSG00000168495 POLR3D 8 22245133 22254601 1 protein_coding Duplication
261 ENSG00000197181 PIWIL2 8 22275316 22357568 1 protein_coding Duplication
263 ENSG00000104635 SLC39A14 8 22367278 22434129 1 protein_coding Duplication
266 ENSG00000120910 PPP3CC 8 22440819 22541142 1 protein_coding Duplication
269 ENSG00000120896 SORBS3 8 22544986 22575788 1 protein_coding Duplication
273 ENSG00000120913 PDLIM2 8 22578279 22598025 1 protein_coding Duplication
274 ENSG00000248235 8 22589274 22602084 1 protein_coding Duplication
275 ENSG00000241852 C8orf58 8 22599599 22604150 1 protein_coding Duplication
277 ENSG00000158941 CCAR2 8 22604757 22620964 1 protein_coding Duplication
279 ENSG00000147439 BIN3 8 22620418 22669148 -1 protein_coding Duplication
283 ENSG00000179388 EGR3 8 22687659 22693480 -1 protein_coding Duplication
288 ENSG00000134020 PEBP4 8 22713251 23000000 -1 protein_coding Duplication
297 ENSG00000008853 RHOBTB2 8 22987417 23020509 1 protein_coding Duplication
298 ENSG00000120889 TNFRSF10B 8 23020133 23069031 -1 protein_coding Duplication
302 ENSG00000284956 8 23084403 23115536 1 protein_coding Duplication
303 ENSG00000173535 TNFRSF10C 8 23102921 23117445 1 protein_coding Duplication
304 ENSG00000173530 TNFRSF10D 8 23135588 23164027 -1 protein_coding Duplication
307 ENSG00000104689 TNFRSF10A 8 23190452 23225102 -1 protein_coding Duplication
312 ENSG00000147457 CHMP7 8 23243637 23262000 1 protein_coding Duplication
314 ENSG00000104679 R3HCC1 8 23270120 23296279 1 protein_coding Duplication
316 ENSG00000134013 LOXL2 8 23296897 23425328 -1 protein_coding Duplication
318 ENSG00000197217 ENTPD4 8 23385783 23457695 -1 protein_coding Duplication
326 ENSG00000147454 SLC25A37 8 23528956 23575463 1 protein_coding Duplication
330 ENSG00000167034 NKX3-1 8 23678697 23682938 -1 protein_coding Duplication
331 ENSG00000180053 NKX2-6 8 23701740 23706756 -1 protein_coding Duplication
338 ENSG00000159167 STC1 8 23841929 23854806 -1 protein_coding Duplication
343 ENSG00000042980 ADAM28 8 24294069 24359014 1 protein_coding Duplication
344 ENSG00000134028 ADAMDEC1 8 24384285 24406013 1 protein_coding Duplication
345 ENSG00000069206 ADAM7 8 24440930 24526970 1 protein_coding Duplication
349 ENSG00000104722 NEFM 8 24913758 24919098 1 protein_coding Duplication
351 ENSG00000277586 NEFL 8 24950955 24956721 -1 protein_coding Duplication
362 ENSG00000147459 DOCK5 8 25184689 25418082 1 protein_coding Duplication
366 ENSG00000147437 GNRH1 8 25419258 25424654 -1 protein_coding Duplication
368 ENSG00000104756 KCTD9 8 25427847 25458476 -1 protein_coding Duplication
369 ENSG00000184661 CDCA2 8 25459199 25507911 1 protein_coding Duplication
377 ENSG00000221818 EBF2 8 25841725 26045413 -1 protein_coding Duplication
386 ENSG00000221914 PPP2R2A 8 26291508 26372680 1 protein_coding Duplication
389 ENSG00000104765 BNIP3L 8 26383054 26505636 1 protein_coding Duplication
394 ENSG00000240694 PNMA2 8 26504701 26514092 -1 protein_coding Duplication
395 ENSG00000092964 DPYSL2 8 26514031 26658178 1 protein_coding Duplication
399 ENSG00000120907 ADRA1A 8 26748150 26867278 -1 protein_coding Duplication
411 ENSG00000015592 STMN4 8 27235308 27258420 -1 protein_coding Duplication
412 ENSG00000104228 TRIM35 8 27284886 27311272 -1 protein_coding Duplication
413 ENSG00000120899 PTK2B 8 27311482 27459391 1 protein_coding Duplication
415 ENSG00000120903 CHRNA2 8 27459756 27479883 -1 protein_coding Duplication
416 ENSG00000120915 EPHX2 8 27490781 27548615 1 protein_coding Duplication
420 ENSG00000120885 CLU 8 27596917 27614700 -1 protein_coding Duplication
422 ENSG00000168077 SCARA3 8 27633868 27676776 1 protein_coding Duplication
430 ENSG00000147419 CCDC25 8 27733316 27772653 -1 protein_coding Duplication
431 ENSG00000171320 ESCO2 8 27771949 27812640 1 protein_coding Duplication
433 ENSG00000168078 PBK 8 27809624 27838082 -1 protein_coding Duplication
436 ENSG00000168079 SCARA5 8 27869883 27992673 -1 protein_coding Duplication
441 ENSG00000189233 NUGGC 8 28021964 28083936 -1 protein_coding Duplication
442 ENSG00000134014 ELP3 8 28089673 28191156 1 protein_coding Duplication
449 ENSG00000168081 PNOC 8 28316986 28343355 1 protein_coding Duplication
450 ENSG00000186918 ZNF395 8 28345590 28402701 -1 protein_coding Duplication
451 ENSG00000214050 FBXO16 8 28348287 28490278 -1 protein_coding Duplication
457 ENSG00000104290 FZD3 8 28494205 28574267 1 protein_coding Duplication
461 ENSG00000012232 EXTL3 8 28600469 28756561 1 protein_coding Duplication
465 ENSG00000104299 INTS9 8 28767661 28890242 -1 protein_coding Duplication
468 ENSG00000147421 HMBOX1 8 28890395 29064764 1 protein_coding Duplication
475 ENSG00000197892 KIF13B 8 29067278 29263124 -1 protein_coding Duplication
481 ENSG00000120875 DUSP4 8 29333064 29350684 -1 protein_coding Duplication
512 ENSG00000133872 SARAF 8 30063003 30083208 -1 protein_coding Duplication
515 ENSG00000104660 LEPROTL1 8 30095408 30177208 1 protein_coding Duplication
517 ENSG00000177669 MBOAT4 8 30131671 30144665 -1 protein_coding Duplication
519 ENSG00000104671 DCTN6 8 30156319 30183639 1 protein_coding Duplication
535 ENSG00000157110 RBPMS 8 30384511 30572256 1 protein_coding Duplication
539 ENSG00000197265 GTF2E2 8 30578318 30658236 -1 protein_coding Duplication
541 ENSG00000253457 SMIM18 8 30638580 30646064 1 protein_coding Duplication
543 ENSG00000104687 GSR 8 30678066 30727846 -1 protein_coding Duplication
544 ENSG00000104691 UBXN8 8 30729131 30767006 1 protein_coding Duplication
546 ENSG00000104695 PPP2CB 8 30774457 30814314 -1 protein_coding Duplication
547 ENSG00000133863 TEX15 8 30831544 30913008 -1 protein_coding Duplication
551 ENSG00000172733 PURG 8 30995802 31033715 -1 protein_coding Duplication
552 ENSG00000165392 WRN 8 31033788 31176138 1 protein_coding Duplication
563 ENSG00000157168 NRG1 8 31639222 32855666 1 protein_coding Duplication
571 ENSG00000286131 8 32647202 32647390 1 protein_coding Duplication
582 ENSG00000172728 FUT10 8 33370824 33473146 -1 protein_coding Duplication
585 ENSG00000129696 TTI2 8 33473386 33513185 -1 protein_coding Duplication
586 ENSG00000198042 MAK16 8 33485182 33501262 1 protein_coding Duplication
592 ENSG00000133874 RNF122 8 33547754 33567128 -1 protein_coding Duplication
594 ENSG00000133878 DUSP26 8 33591330 33600023 -1 protein_coding Duplication
615 ENSG00000156687 UNC5D 8 35235475 35796550 1 protein_coding Duplication
633 ENSG00000215262 KCNU1 8 36784324 36936125 1 protein_coding Duplication
659 ENSG00000183779 ZNF703 8 37695782 37700019 1 protein_coding Duplication
663 ENSG00000147475 ERLIN2 8 37736601 37758422 1 protein_coding Duplication
666 ENSG00000147471 PLPBP 8 37762595 37779768 1 protein_coding Duplication
668 ENSG00000020181 ADGRA2 8 37784191 37844896 1 protein_coding Duplication
670 ENSG00000104221 BRF2 8 37843268 37849861 -1 protein_coding Duplication
671 ENSG00000156675 RAB11FIP1 8 37858618 37899497 -1 protein_coding Duplication
674 ENSG00000169154 GOT1L1 8 37934281 37940124 -1 protein_coding Duplication
675 ENSG00000285880 8 37934340 37965953 -1 protein_coding Duplication
677 ENSG00000188778 ADRB3 8 37962990 37966599 -1 protein_coding Duplication
679 ENSG00000187840 EIF4EBP1 8 38030534 38060365 1 protein_coding Duplication
684 ENSG00000129691 ASH2L 8 38105493 38144076 1 protein_coding Duplication
687 ENSG00000147465 STAR 8 38142700 38150992 -1 protein_coding Duplication
689 ENSG00000175324 LSM1 8 38163335 38176730 -1 protein_coding Duplication
691 ENSG00000156735 BAG4 8 38176533 38213301 1 protein_coding Duplication
694 ENSG00000085788 DDHD2 8 38225218 38275558 1 protein_coding Duplication
695 ENSG00000147535 PLPP5 8 38263130 38269243 -1 protein_coding Duplication
696 ENSG00000147548 NSD3 8 38269704 38382272 -1 protein_coding Duplication
701 ENSG00000165046 LETM2 8 38386207 38409527 1 protein_coding Duplication
702 ENSG00000077782 FGFR1 8 38400215 38468834 -1 protein_coding Duplication
718 ENSG00000147526 TACC1 8 38728186 38853028 1 protein_coding Duplication
723 ENSG00000169499 PLEKHA2 8 38901235 38973912 1 protein_coding Duplication
725 ENSG00000169495 HTRA4 8 38974228 38988663 1 protein_coding Duplication
726 ENSG00000169490 TM2D2 8 38988808 38996824 -1 protein_coding Duplication
727 ENSG00000168615 ADAM9 8 38996754 39105445 1 protein_coding Duplication
729 ENSG00000197140 ADAM32 8 39106990 39284917 1 protein_coding Duplication
739 ENSG00000168619 ADAM18 8 39584489 39730065 1 protein_coding Duplication
740 ENSG00000104755 ADAM2 8 39743735 39838227 -1 protein_coding Duplication
743 ENSG00000131203 IDO1 8 39902275 39928790 1 protein_coding Duplication
747 ENSG00000188676 IDO2 8 39934614 40016392 1 protein_coding Duplication
753 ENSG00000176907 TCIM 8 40153482 40155310 1 protein_coding Duplication
# Sanity check: no Ensembl gene ID should be reported for both the deleted and
# the duplicated interval.
overlapping_genes <- intersect(
  protein_coding_genes_del$ensembl_gene_id,
  protein_coding_genes_dup$ensembl_gene_id
)
if (length(overlapping_genes) == 0) {
  cat("No genes are present in both the deletion and duplication regions.\n")
} else {
  cat("Genes present in both regions:\n")
  print(overlapping_genes)
}
## No genes are present in both the deletion and duplication regions.

Differential Expression analysis

prepare input data

# Read the gene-level count table and the sample sheet.
counts <- read.csv("gene_count.csv")
samples <- read_xlsx("Sample List.xlsx")

# Fail early, with a readable message, if the sample sheet names a sample that
# has no matching column in the count table.
sample_names <- samples$`Sample Name`
missing_samples <- setdiff(sample_names, colnames(counts))
if (length(missing_samples) > 0) {
  stop(
    "Samples listed in the sample sheet but absent from gene_count.csv: ",
    paste(missing_samples, collapse = ", "),
    call. = FALSE
  )
}

# raw.counts: gene_id plus one count column per sample, in sample-sheet order.
# annotation: every remaining (non-sample) column of the count table.
# Both selections use all_of() so the external character vector is handled the
# same way in each.
raw.counts <- counts %>% dplyr::select(gene_id, all_of(sample_names))
annotation <- counts %>% dplyr::select(-all_of(sample_names))

define functions

# Default minimum |logFC| for a gene to count as differentially expressed in
# the thresholded summaries.
fc_threshold <- 1.0

#' Summarise one edgeR quasi-likelihood test
#'
#' @param contrast Label for the comparison (stored as-is, used for plotting).
#' @param qlf Test result as returned by `glmQLFTest()`.
#' @param p_value Significance cutoff. It is passed to `decideTests()` and is
#'   also used as the FDR cutoff for `thresholded_results`, so both summaries
#'   apply the same significance level.
#' @param n_top Number of genes requested from `topTags()`. With a finite
#'   value, `thresholded_results` only covers those top genes.
#' @param lfc_threshold Minimum absolute logFC for `thresholded_results`.
#'   Defaults to the script-level `fc_threshold`.
#' @return A list with the inputs (`contrast`, `qlf`), the `decideTests()`
#'   result and its summary, the `topTags()` result, the number of genes
#'   called up (1), down (-1) and unchanged (0) by `decideTests()`, and
#'   `thresholded_results`: the rows of the top table passing both the FDR and
#'   the |logFC| cutoffs.
get_results <- function(contrast, qlf, p_value = 0.05, n_top = Inf,
                        lfc_threshold = fc_threshold) {
  # identify significant DE genes (significance only, no fold-change filter)
  is.de <- decideTests(qlf, p.value = p_value)
  summary_de <- summary(is.de)
  top_tags <- topTags(qlf, n = n_top)
  upregulated <- sum(is.de == 1)
  downregulated <- sum(is.de == -1)
  no_change <- sum(is.de == 0)

  # apply the FDR and fold-change thresholds; which() drops rows whose FDR or
  # logFC is NA instead of turning them into all-NA rows
  tags <- top_tags$table
  passes <- which(tags$FDR <= p_value & abs(tags$logFC) >= lfc_threshold)
  thresholded_results <- tags[passes, , drop = FALSE]

  list(
    contrast = contrast,
    qlf = qlf,
    is_de = is.de,
    summary_de = summary_de,
    top_tags = top_tags,
    upregulated = upregulated,
    downregulated = downregulated,
    no_change = no_change,
    thresholded_results = thresholded_results
  )
}

# Draw a grouped bar chart of the up / down / unchanged gene counts, with one
# cluster of bars per comparison. Each element of `results_list` must provide
# `contrast`, `upregulated`, `downregulated` and `no_change`. The plot is
# printed; the value of print() is returned.
plot_all_results <- function(results_list) {
  # Three rows (one per category) for a single comparison
  tally_one <- function(result) {
    data.frame(
      Category = c("Upregulated", "Downregulated", "No Change"),
      Count = c(result$upregulated, result$downregulated, result$no_change),
      Comparison = result$contrast
    )
  }
  plot_data <- do.call(rbind, lapply(results_list, tally_one))

  fill_colours <- c(
    "Upregulated" = "lightblue",
    "Downregulated" = "pink",
    "No Change" = "gray"
  )
  # Count labels sit just above their (dodged) bars
  count_labels <- geom_text(
    aes(label = Count),
    position = position_dodge(width = 0.9),
    vjust = -0.5
  )

  plot <- ggplot(plot_data, aes(x = Comparison, y = Count, fill = Category)) +
    geom_bar(stat = "identity", position = "dodge") +
    count_labels +
    labs(
      title = "Differential Expression Across Comparisons",
      subtitle = "padj <= 0.05",
      x = "Comparison",
      y = "Number of Genes"
    ) +
    theme_minimal() +
    scale_fill_manual(values = fill_colours)

  print(plot)
}

#' Plot and tabulate thresholded up/down gene counts per comparison
#'
#' @param results_list Non-empty list of result objects, each providing
#'   `contrast` and `thresholded_results` (a table with `logFC` and `FDR`
#'   columns), as built by `get_results()`.
#' @param lfc_threshold Minimum absolute logFC. Defaults to the script-level
#'   `fc_threshold`.
#' @param fdr_cutoff Maximum FDR. Defaults to 0.05.
#' @return The data frame behind the plot (columns `Category`, `Count`,
#'   `Comparison`). The plot itself is printed as a side effect.
plot_thresholded_results <- function(results_list,
                                     lfc_threshold = fc_threshold,
                                     fdr_cutoff = 0.05) {
  stopifnot(length(results_list) > 0)

  # Two rows (up / down) for a single comparison
  count_one <- function(result) {
    tab <- result$thresholded_results
    significant <- tab$FDR <= fdr_cutoff
    # na.rm = TRUE: rows with a missing FDR or logFC are not counted
    num_upregulated <- sum(significant & tab$logFC >= lfc_threshold, na.rm = TRUE)
    num_downregulated <- sum(significant & tab$logFC <= -lfc_threshold, na.rm = TRUE)

    data.frame(
      Category = c("Upregulated", "Downregulated"),
      Count = c(num_upregulated, num_downregulated),
      Comparison = result$contrast
    )
  }
  plot_data <- do.call(rbind, lapply(results_list, count_one))

  plot <- ggplot(plot_data, aes(x = Comparison, y = Count, fill = Category)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(aes(label = Count),
              position = position_dodge(width = 0.9),
              vjust = -0.5) +  # Adds numbers on top of bars
    labs(title = "Differential Expression Across Comparisons",
         # Built from the cutoffs actually applied, so the label cannot drift
         # from the filter; reads "padj <= 0.05 and |logFC| >= 1" by default
         subtitle = paste0("padj <= ", fdr_cutoff, " and |logFC| >= ", lfc_threshold),
         x = "Comparison",
         y = "Number of Genes") +
    theme_minimal() +
    scale_fill_manual(values = c("Upregulated" = "skyblue", "Downregulated" = "maroon"))

  print(plot)
  plot_data
}

#' Centred rolling mean with shrinking windows at the edges
#'
#' For position `i` the mean is taken over `x[(i - h):(i + h)]`, where
#' `h = k %/% 2`, clipped to the valid index range `1..length(x)`. Near either
#' end the window therefore contains fewer values instead of producing `NA`.
#' Missing values inside a window are ignored (`na.rm = TRUE`).
#'
#' @param x Numeric vector.
#' @param k Nominal window width: a single non-negative number. The effective
#'   window is `2 * (k %/% 2) + 1` values wide, so an odd `k` behaves like
#'   `k - 1`.
#' @return Numeric vector of the same length as `x`. Inputs shorter than `k`
#'   are supported (the window simply covers what is available), and an empty
#'   input returns `numeric(0)`.
custom_rollmean <- function(x, k = 40) {
  stopifnot(is.numeric(k), length(k) == 1, !is.na(k), k >= 0)

  n <- length(x)
  if (n == 0) {
    return(numeric(0))
  }

  # Integer half-width: avoids fractional indices when k is odd
  half_window <- k %/% 2

  vapply(
    seq_len(n),
    function(i) {
      # Clamp the window to the series so short inputs and edges both work
      lo <- max(1, i - half_window)
      hi <- min(n, i + half_window)
      mean(x[lo:hi], na.rm = TRUE)
    },
    numeric(1)
  )
}

analysis

# Pool the two parental groups ("MOM" and "DAD") under a single "Parents"
# label; every other group name is left as it is.
parent_groups <- c("MOM", "DAD")
samples.parents <- mutate(
  samples,
  `Group Name` = replace(`Group Name`, `Group Name` %in% parent_groups, "Parents")
)
print(samples.parents)
## # A tibble: 12 × 7
##    `Sample Name` `Cell line`    Concentration (ng/uL…¹ `Volume (uL)` `A260/A280`
##    <chr>         <chr>                           <dbl>         <dbl>       <dbl>
##  1 SL_1          31.3 (Sample …                  1194.            30        2.09
##  2 SL_2          31.3 (Sample …                  1068.            30        2.08
##  3 SL_3          31.3 (Sample …                  1097.            30        2.08
##  4 SL_4          JE01 214 (Sam…                   523.            30        2.00
##  5 SL_5          JE01 214 (Sam…                   762.            30        2.04
##  6 SL_6          JE01 214 (Sam…                   410.            30        1.96
##  7 SL_7          255-1 (p13)                      244.            30        2.08
##  8 SL_8          255-4 (p13)                      606.            30        2.09
##  9 SL_9          255-3 (p13)                      495.            30        2.06
## 10 SL_10         294-1                           1128.            30        2.07
## 11 SL_11         294-2                            750.            30        2.07
## 12 SL_12         294-3                            382.            30        2.09
## # ℹ abbreviated name: ¹​`Concentration (ng/uL)`
## # ℹ 2 more variables: `A260/A230` <dbl>, `Group Name` <chr>
# Build the edgeR container: counts per sample, the sample sheet as sample
# metadata, and the pooled group labels as the grouping factor.
y.parents <- DGEList(counts = raw.counts, samples = samples.parents, group = samples.parents$`Group Name`)
# Attach the non-count columns of the count table as gene annotation.
y.parents$genes <- annotation

keep <- filterByExpr(y.parents, min.count = 30, min.total.count = 50, large.n = 10, min.prop = 0.75) # filter lowly expressed transcripts
table(keep)
## keep
## FALSE  TRUE 
## 43987 14748
# Drop the filtered genes and recompute library sizes from what remains.
y.parents <- y.parents[keep, , keep.lib.sizes = FALSE]
y.parents <- normLibSizes(y.parents) # TMM normalization
# calculate CPM
# (TRUE is spelled out: `T` is an ordinary variable that can be reassigned)
log2_cpm <- cpm(y.parents, log = TRUE, prior.count = 1, normalized.lib.sizes = TRUE)
cpm_counts <- cpm(y.parents, normalized.lib.sizes = TRUE)
# Annotated expression tables on the linear and on the log2 scale.
tmm_data_with_annotations <- cbind(y.parents$genes, cpm_counts)
log2_tmm_data_with_annotations <- cbind(y.parents$genes, log2_cpm)

# Label the expression columns of both matrices by cell line.
cell_line_labels <- samples$`Cell line`
colnames(log2_cpm) <- cell_line_labels
colnames(cpm_counts) <- cell_line_labels

# Gene ID / description columns shared by both export tables.
gene_annotations <- data.frame(
  GeneID = y.parents$genes$gene_id,
  Description = y.parents$genes$gene_description
)

# Export table on the log2 scale (written only if the call is uncommented).
gsea.input <- cbind(gene_annotations, log2_cpm)
# write.table(gsea.input, file="log2_TMM_normalized_counts.txt", 
#             sep="\t", quote=FALSE, row.names=FALSE)

# Export table on the linear scale; this is the value gsea.input keeps.
gsea.input <- cbind(gene_annotations, cpm_counts)
# write.table(gsea.input, file="TMM_normalized_counts.txt", 
#             sep="\t", quote=FALSE, row.names=FALSE)

# DE analysis
# Multidimensional-scaling plot of the samples, drawn before any model is fit.
plotMDS(y.parents)

# Design matrix without an intercept (~ 0 + group): one column per group,
# taken from the `group` column of the DGEList sample table.
samples.design.parents <- model.matrix(~ 0 + group,data = y.parents$samples) # design
# Strip the "group" prefix that model.matrix() adds, so the columns carry the
# bare group names used later in makeContrasts().
colnames(samples.design.parents) <- gsub("group", "", colnames(samples.design.parents))
# Estimate dispersions under this design (robust = TRUE), then report the
# common dispersion and plot the biological coefficient of variation.
y.parents <- estimateDisp(y.parents, samples.design.parents, robust=TRUE)
print(y.parents$common.dispersion)
## [1] 0.03257095
plotBCV(y.parents)

# Quasi-likelihood GLM fit under the same design, followed by the plot of the
# quasi-likelihood dispersions.
fit.parents <- glmQLFit(y.parents, samples.design.parents, robust=TRUE)
plotQLDisp(fit.parents)

# make contrast
# Each contrast is "first group minus second group" over the design columns.
# NOTE(review): a positive logFC then presumably means higher expression in
# the first-named group -- confirm against the edgeR documentation.
parent.contrast <- makeContrasts(PROvsParents=PRO-Parents, levels=samples.design.parents)
rev.contrast <- makeContrasts(PROvsREV=PRO-REV, levels=samples.design.parents)
parent.rev.contrast <- makeContrasts(ParentsvsREV=Parents-REV, levels=samples.design.parents)

# One quasi-likelihood F-test per contrast, all on the same fitted model.
qlf.PROvsParents <- glmQLFTest(fit.parents, contrast=parent.contrast[,"PROvsParents"])
qlf.PROvsREV <- glmQLFTest(fit.parents, contrast=rev.contrast[,"PROvsREV"])
qlf.ParentsvsREV <- glmQLFTest(fit.parents, contrast=parent.rev.contrast[,"ParentsvsREV"])

# Summarise each test with the default cutoffs of get_results(); the first
# argument is the label shown on the plots.
contr.parents <- get_results("PRO - Parents", qlf.PROvsParents)
contr.rev <- get_results("PRO - REV", qlf.PROvsREV)
contr.parent.rev <- get_results("Parents - REV", qlf.ParentsvsREV)

# Bar chart of up / down / unchanged counts for all three comparisons.
parents_results_list <- list(contr.parents, contr.rev,contr.parent.rev)
plot_all_results(parents_results_list)

# Bar chart after the fold-change threshold; the counts behind it are kept.
table.results <- plot_thresholded_results(parents_results_list)

# write_csv(contr.rev$top_tags$table, "PROvsREV_DE_results.csv")

GSEA

Import data

# Ranked gene list for PRO versus REV, read as a tab-separated table. Quote and
# comment handling are switched off so `"`, `'` and `#` inside fields are kept
# literally; `fill = TRUE` pads short rows instead of failing.
ranked_data_PRO_versus_REV <- read.table(
  file = "ranked_gene_list_PRO_versus_REV_1728681059442.tsv",
  sep = "\t",
  header = TRUE,
  quote = "",
  comment.char = "",
  fill = TRUE
)

# import gene set
# One term-to-gene table per MSigDB collection (H, C5, C2), keeping the set
# name, the Entrez gene ID and the set description.
H_t2g <- msigdbr(species = "Homo sapiens", category = "H") %>%
  dplyr::select(gs_name, entrez_gene, gs_description)

C5_t2g <- msigdbr(species = "Homo sapiens", category = "C5") %>%
  dplyr::select(gs_name, entrez_gene, gs_description)

C2_t2g <- msigdbr(species = "Homo sapiens", category = "C2") %>%
  dplyr::select(gs_name, entrez_gene, gs_description)

Analysis

# NOTE(review): `pvalue_cutoff` is defined here, but the GSEA() calls visible
# in this section pass `pvalueCutoff = 1.0` directly -- confirm where it is used.
pvalue_cutoff <- 0.05
n_permutations <- 100000

# Named score vector: values are the ranking scores, names are the gene names
# from the ranked list.
ranked_genes <- setNames(
  ranked_data_PRO_versus_REV$SCORE,
  ranked_data_PRO_versus_REV$NAME
)

# Map to Entrez IDs (the MSigDB term-to-gene tables are keyed by Entrez ID).
# With multiVals = "first", a symbol that maps to several IDs keeps the first.
entrez_ids <- mapIds(org.Hs.eg.db,
                     keys = names(ranked_genes),
                     column = "ENTREZID",
                     keytype = "SYMBOL",
                     multiVals = "first")

# Remove unmapped genes and rename the remaining scores by Entrez ID.
ranked_genes_entrez <- ranked_genes[!is.na(entrez_ids)]
names(ranked_genes_entrez) <- entrez_ids[!is.na(entrez_ids)]

# Resolve ties by adding a small random number. The RNG is seeded first so the
# jitter -- and therefore the gene order handed to GSEA -- is identical on
# every run; without a seed the ranking changes between runs.
set.seed(1234)
ranked_genes_entrez <- ranked_genes_entrez +
  runif(length(ranked_genes_entrez), min = -1e-5, max = 1e-5)

# GSEA expects the gene list sorted in decreasing order.
ranked_genes_entrez <- sort(ranked_genes_entrez, decreasing = TRUE)
head(ranked_genes_entrez)
##    643224      7503    647946    222663 101928913    653333 
##  5.000008  5.000007  5.000004  5.000002  5.000002  5.000001
set.seed(1234)

# The three collections are analysed with identical settings; only the
# term-to-gene table differs, so the shared call lives in one local helper.
# pvalueCutoff = 1.0 keeps every tested gene set in the result.
run_gsea_collection <- function(term2gene) {
  GSEA(
    geneList = ranked_genes_entrez,
    TERM2GENE = term2gene,
    pvalueCutoff = 1.0,
    eps = 0,
    nPermSimple = n_permutations,
    minGSSize = 15,
    maxGSSize = 500,
    verbose = TRUE
  )
}

# Run order (H, C5, C2) is kept so each run consumes the same part of the
# random-number stream as before.
GSEA_H <- run_gsea_collection(H_t2g)

GSEA_C5 <- run_gsea_collection(C5_t2g)

GSEA_C2 <- run_gsea_collection(C2_t2g)

Manuscript Figures

Figure 4

# Lookup table from sample name to group name, used to label expression columns.
sample_to_group <- dplyr::select(samples, `Sample Name`, `Group Name`)

library(MetBrewer)
library(ggthemes)
library(grid)

# Eight discrete colours from the "Cassatt1" palette; three of them are picked
# as shades for the deletion, duplication and centromere annotations.
annotation_colors <- met.brewer("Cassatt1", n = 8, type = "discrete")
dup_shade <- annotation_colors[4]
del_shade <- annotation_colors[5]
centr_shade <- annotation_colors[8]

Figure 4A

# Per-gene, per-group mean expression for chromosome 8: keep the coordinate
# and sample columns, reshape to one row per gene/sample, attach each sample's
# group, then average within gene and group.
gene.deldup.tmm <- log2_tmm_data_with_annotations %>%
  filter(gene_chr == "8") %>%
  dplyr::select(gene_id, gene_start, gene_end, samples$`Sample Name`) %>%
  pivot_longer(
    cols = -c(gene_id, gene_start, gene_end),
    names_to = "Sample",
    values_to = "Expression"
  ) %>%
  left_join(sample_to_group, by = c("Sample" = "Sample Name")) %>%
  group_by(gene_id, gene_start, gene_end, `Group Name`) %>%
  summarise(
    Average_Expression = mean(Expression, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(gene_start, desc(Average_Expression))


# PRO minus REV difference per gene, smoothed within each chromosome segment.
# Genes are assigned to a segment by their start coordinate; a start between
# 44033745 and 45877265 matches no rule and therefore gets an NA region.
gene.deldup.tmm.plot.relative.PRO <- gene.deldup.tmm %>%
  mutate(
    region = case_when(
      gene_start >= 0 & gene_start < deletion_end ~
        "deletion",
      gene_start >= deletion_end & gene_start < duplication_start ~
        "Region between invdupdel",
      gene_start >= duplication_start & gene_start < duplication_end ~
        "duplication",
      gene_start >= duplication_end & gene_start < 44033745 ~
        "Region from dup to centromere",
      gene_start >= 45877265 ~
        "8q"
    )
  ) %>%
  # Only the PRO and REV groups are compared here.
  filter(`Group Name` %in% c('PRO', 'REV')) %>%
  tidyr::pivot_wider(
    names_from = `Group Name`,
    values_from = Average_Expression
  ) %>%
  mutate(relative_expression = PRO - REV) %>%
  # custom_rollmean() is defined elsewhere in this file; it is applied
  # separately to each region with k = 14.
  mutate(
    rolling_avg = custom_rollmean(relative_expression, k = 14),
    .by = region
  )


# Bold text labels placed under the shaded bands of the plot.
text_del <- textGrob("Deletion", gp = gpar(fontsize = 18, fontface = "bold"))
text_dup <- textGrob("Duplication", gp = gpar(fontsize = 18, fontface = "bold"))
text_cen <- textGrob("Centromere", gp = gpar(fontsize = 18, fontface = "bold"))

# Figure 4A: smoothed PRO - REV expression difference along chromosome 8, one
# point per gene positioned at the gene midpoint.
plot_object <- ggplot(
  gene.deldup.tmm.plot.relative.PRO,
  aes(x = (gene_start + gene_end) / 2, y = rolling_avg)
) +
  # Shaded band for the deletion region
  annotate("rect", xmin = deletion_start, xmax = deletion_end,
           ymin = -Inf, ymax = Inf,
           fill = del_shade, alpha = 0.8) +
  # Shaded band for the duplication region
  annotate("rect", xmin = duplication_start, xmax = duplication_end,
           ymin = -Inf, ymax = Inf,
           fill = dup_shade, alpha = 0.8) +
  # Grey band for the interval labelled "Centromere". Drawn with annotate()
  # rather than geom_rect(aes(...)): constants inside aes() produce one
  # rectangle per data row, which stacks the transparency into a solid block.
  annotate("rect", xmin = 44033745, xmax = 45877265,
           ymin = -Inf, ymax = Inf,
           fill = "grey", color = "grey", alpha = 0.6) +
  # Labels sit at y = -1.95, below the y-axis limit of -1.5; they are only
  # visible because clipping is turned off further down.
  annotation_custom(text_del,
                    xmin = (deletion_start + deletion_end) / 2,
                    xmax = (deletion_start + deletion_end) / 2,
                    ymin = -1.95, ymax = -1.95) +
  annotation_custom(text_dup,
                    xmin = (duplication_start + duplication_end) / 2,
                    xmax = (duplication_start + duplication_end) / 2,
                    ymin = -1.95, ymax = -1.95) +
  annotation_custom(text_cen,
                    xmin = (44033745 + 45877265) / 2,
                    xmax = (44033745 + 45877265) / 2,
                    ymin = -1.95, ymax = -1.95) +
  # `linewidth` replaces the `size` argument deprecated for lines in
  # ggplot2 3.4.0.
  geom_hline(yintercept = 0, linetype = "dashed", color = "black",
             linewidth = 0.5, alpha = 0.8) +
  geom_point(size = 1.5, alpha = 0.9, color = "#727572") +
  labs(
    title = "Gene Expression on Chromosome 8 - Proband relative to Revertant",
    x = "Chromosome Position (bp)",
    y = "Relative Expression Level (log2)",
    color = NULL
  ) +
  scale_y_continuous(limits = c(-1.5, 1.5), breaks = seq(-1.5, 1.5, by = 0.5)) +
  # Labels are generated from the breaks (comma-grouped, e.g. "20,000,000"),
  # so their number always matches however many breaks seq() yields.
  scale_x_continuous(
    expand = c(0, 0),
    limits = c(0, max(gene.deldup.tmm.plot.relative.PRO$gene_end) + 2e6),
    breaks = seq(from = 0,
                 to = max(gene.deldup.tmm.plot.relative.PRO$gene_end),
                 by = 20000000),
    labels = scales::label_comma()
  ) +
  expand_limits(x = 0, y = -1.5) +
  theme_classic() +
  coord_cartesian(clip = "off") +
  theme(
    # The margin must be set after theme_classic(): a complete theme replaces
    # every earlier theme() setting, so a margin set before it is discarded.
    plot.margin = unit(c(1, 1, 2, 1), "lines"),
    text = element_text(family = "Arial", face = "bold"),
    plot.title = element_text(size = 28, face = "bold", hjust = 0.5, family = "Arial"),
    axis.title = element_text(size = 20, family = "Arial"),
    axis.text = element_text(size = 18, family = "Arial"),
    axis.title.x = element_text(margin = margin(t = 20)),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Render the Figure 4A plot.
print(plot_object)

# Optional export of the figure to PNG (disabled).
# ggsave(
#   filename = "gene_expression_plot_log2_PROvsREV.png",
#   plot = plot_object,
#   width = 18,   # Adjust the width to make the canvas longer
#   height = 6,   # Adjust the height to control the aspect ratio
#   dpi = 300,     # Set the resolution to 300 dpi
#   bg = "transparent"
# )

# calculate mean expression
# Per-region mean of the REV and PRO columns, plus the PRO/REV ratio obtained
# by raising 2 to the difference of the two means.
mean_values <- gene.deldup.tmm.plot.relative.PRO %>%
  group_by(region) %>%
  summarise(
    REV_mean = mean(REV, na.rm = TRUE),
    PRO_mean = mean(PRO, na.rm = TRUE),
    abs_fold_change = 2^(PRO_mean - REV_mean)
  )

mean_values
region REV_mean PRO_mean abs_fold_change
8q 4.260991 4.337254 1.054284
Region between invdupdel 3.854454 4.007803 1.112148
Region from dup to centromere 4.889115 5.020112 1.095050
deletion 3.521351 2.723188 0.575081
duplication 4.353541 4.992940 1.557680

Figure 4B

# PRO vs REV
# Significant genes (FDR <= 0.05 and |logFC| >= 1) labelled by chromosome 8
# region. A gene is assigned by its start coordinate, the same rule used for
# the PRO vs Parents table and for the per-region gene totals, so the counts
# are comparable. (This table previously tested `gene_end` against the upper
# bounds, which classified boundary-spanning genes differently.)
plot_tmp1 <- contr.rev$top_tags$table %>%
  mutate(region = case_when(
    gene_chr == "8" & gene_start >= 0 & gene_start < deletion_end ~ "deletion",
    gene_chr == "8" & gene_start >= duplication_start & gene_start < duplication_end ~ "duplication",
    TRUE ~ "other"
  )) %>%
  filter(FDR <= 0.05, abs(logFC) >= 1.0)

# Number of significant PRO vs REV genes per region.
region_counts1 <- plot_tmp1 %>%
  group_by(region) %>%
  summarise(count = n())

# PRO vs Parents
plot_tmp2 <- contr.parents$top_tags$table %>%
  mutate(region = case_when(
    gene_chr == "8" & gene_start >= 0 & gene_start < deletion_end ~ "deletion",
    gene_chr == "8" & gene_start >= duplication_start & gene_start < duplication_end ~ "duplication",
    TRUE ~ "other"
  )) %>%
  filter(FDR <= 0.05, abs(logFC) >= 1.0)


# Number of significant PRO vs Parents genes per region.
region_counts2 <- plot_tmp2 %>%
  group_by(region) %>%
  summarise(count = n())

# Side-by-side counts; the suffixes mark which contrast each column is from.
combined_counts <- region_counts1 %>%
  left_join(region_counts2, by = "region", suffix = c(".PROvsREV", ".PROvsParents"))

# Long format: one row per region and contrast.
long_counts <- combined_counts %>%
  pivot_longer(cols = starts_with("count"), names_to = "dataset", values_to = "count")


# # Plot the counts side-by-side
# ggplot(long_counts, aes(x = region, y = count, fill = dataset)) +
#   geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
#   labs(title = "Region Counts - PRO vs Parents", x = "Region", y = "Count") +
#   theme_minimal() +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
#   scale_fill_brewer(palette = "Set2") +
#   scale_y_log10() +
#   geom_text(aes(label = round(count, 2)), 
#             position = position_dodge(width = 0.8), vjust = -0.5, size = 3)

# Chi-square
# Label every gene in the expression table by chromosome 8 region (assigned by
# gene start) to obtain the total number of genes per region.
region <- log2_tmm_data_with_annotations %>%
  mutate(region = case_when(
    gene_chr == "8" & gene_start >= 0 & gene_start < deletion_end ~ "deletion",
    gene_chr == "8" & gene_start >= duplication_start & gene_start < duplication_end ~ "duplication",
    TRUE ~ "other"
  ))

value_counts <- region %>%
  count(region, sort = TRUE)

# Look up the count stored for one region. A region with no row (for example
# no significant genes) yields 0 instead of a zero-length vector, which
# matrix() would otherwise recycle silently into a wrong 2 x 2 table.
region_count <- function(counts, region_name, column) {
  hit <- counts[[column]][counts$region == region_name]
  if (length(hit) == 0) 0L else hit[[1]]
}

# 2 x 2 table of DE vs non-DE genes for the two contrasts within one region.
de_contingency <- function(de_parents, de_rev, total) {
  matrix(c(de_parents, total - de_parents, de_rev, total - de_rev),
         nrow = 2, byrow = TRUE,
         dimnames = list(c("PRO.Parents", "PRO.REV"),
                         c("DE", "nonDE")))
}

# Total genes per region.
del <- region_count(value_counts, "deletion", "n")
dup <- region_count(value_counts, "duplication", "n")
other <- region_count(value_counts, "other", "n")

# Significant genes per region: region_counts2 holds PRO vs Parents,
# region_counts1 holds PRO vs REV.
del.PRO.Parents <- region_count(region_counts2, "deletion", "count")
del.PRO.REV <- region_count(region_counts1, "deletion", "count")

dup.PRO.Parents <- region_count(region_counts2, "duplication", "count")
dup.PRO.REV <- region_count(region_counts1, "duplication", "count")

other.PRO.Parents <- region_count(region_counts2, "other", "count")
other.PRO.REV <- region_count(region_counts1, "other", "count")

contingency.del <- de_contingency(del.PRO.Parents, del.PRO.REV, del)

contingency.dup <- de_contingency(dup.PRO.Parents, dup.PRO.REV, dup)


contingency.other <- de_contingency(other.PRO.Parents, other.PRO.REV, other)
# Pearson chi-squared test (no continuity correction) on each region's 2 x 2
# table, tidied into a one-row data frame tagged with the region name.
del_result <- chisq.test(contingency.del, correct = FALSE) %>%
  broom::tidy() %>%
  mutate(test = "Deletion")

dup_result <- chisq.test(contingency.dup, correct = FALSE) %>%
  broom::tidy() %>%
  mutate(test = "Duplication")

other_result <- chisq.test(contingency.other, correct = FALSE) %>%
  broom::tidy() %>%
  mutate(test = "Other")

# Combine the three tests and add a display label. P-values below 0.001 are
# shown as "p < 0.001": rounding them to three decimals would print the
# misleading "p = 0". All other labels keep the rounded value.
chi_results <- bind_rows(del_result, dup_result, other_result) %>%
  mutate(label = if_else(
    !is.na(p.value) & p.value < 0.001,
    "p < 0.001",
    paste0("p = ", round(p.value, digits = 3))
  ))


chi_results
statistic p.value parameter method test label
0.1497326 0.6987910 1 Pearson’s Chi-squared test Deletion p = 0.699
6.5641026 0.0104056 1 Pearson’s Chi-squared test Duplication p = 0.01
652.5639951 0.0000000 1 Pearson’s Chi-squared test Other p = 0
# Multiple-testing adjustment of the three p-values; "holm" is the default
# method of p.adjust(), written out here for clarity.
p.adjust(chi_results$p.value, method = "holm")
## [1]  6.987910e-01  2.081124e-02 1.856161e-143
# Parents vs REV
# Significant genes (FDR <= 0.05 and |logFC| >= 1) for this contrast, labelled
# by chromosome 8 region using the gene start coordinate.
plot_tmp3 <- contr.parent.rev$top_tags$table %>%
  mutate(
    region = case_when(
      gene_chr == "8" & gene_start >= 0 & gene_start < deletion_end ~
        "deletion",
      gene_chr == "8" & gene_start >= duplication_start & gene_start < duplication_end ~
        "duplication",
      TRUE ~ "other"
    )
  ) %>%
  filter(FDR <= 0.05 & abs(logFC) >= 1.0)

region_counts3 <- plot_tmp3 %>%
  group_by(region) %>%
  summarise(count = n())


# Per-region counts for all three contrasts side by side. The third count
# column is renamed up front so it follows the same "count.<contrast>" naming.
combined_counts_3 <- region_counts1 %>%
  left_join(
    region_counts2,
    by = "region",
    suffix = c(".PROvsREV", ".PROvsParents")
  ) %>%
  left_join(
    dplyr::rename(region_counts3, count.ParentsvsREV = count),
    by = "region"
  )

long_counts <- combined_counts_3 %>%
  pivot_longer(
    cols = starts_with("count"),
    names_to = "dataset",
    values_to = "count"
  )


# Plot the counts side-by-side
# Dodged bars on a log10 y axis, with each count printed above its bar.
bar_dodge <- position_dodge(width = 0.9)
ggplot(long_counts, aes(x = region, y = count, fill = dataset)) +
  geom_col(position = bar_dodge) +
  geom_text(
    aes(label = round(count, 2)),
    position = bar_dodge,
    vjust = -0.5,
    size = 3
  ) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_log10() +
  labs(title = "Region Counts", x = "Region", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# overlap between PROVREV and ParentsREV
# Genes significant in both contrasts; shared column names get a suffix
# naming the contrast they came from.
merged <- inner_join(
  plot_tmp3, plot_tmp1,
  by = "gene_id",
  suffix = c(".ParentsREV", ".PROREV")
)

# Keep genes whose logFC has the same sign in both contrasts (both positive or
# both negative); a zero logFC in either contrast is excluded.
same_direction <- merged %>%
  filter(sign(logFC.ParentsREV) * sign(logFC.PROREV) > 0)

# Count total
n_total <- nrow(same_direction)

# Count by region: a gene counts only if both contrasts place it in that region.
n_other <- same_direction %>%
  filter(region.ParentsREV == "other", region.PROREV == "other") %>%
  nrow()
n_deletion <- same_direction %>%
  filter(region.ParentsREV == "deletion", region.PROREV == "deletion") %>%
  nrow()
n_duplication <- same_direction %>%
  filter(region.ParentsREV == "duplication", region.PROREV == "duplication") %>%
  nrow()

# Print counts
cat(
  "Total same-direction intersect:", n_total, "\n",
  "Other region:", n_other, "\n",
  "Deletion region:", n_deletion, "\n",
  "Duplication region:", n_duplication, "\n"
)
## Total same-direction intersect: 86 
##  Other region: 82 
##  Deletion region: 1 
##  Duplication region: 3

Figure 4C

# NOTE(review): `genes` is not referenced by the plotting code below, which
# uses `plot.genelist` -- confirm whether it is still needed.
genes <- c("DLGAP2", "CSMD1","RHOBTB2", "CHRNA2", "GATA4", "XKR6")
plot.genelist <- c("MCPH1", "CLN8", "RHOBTB2", "UNC5D", "CNTN4", "NAV3")

# PRO vs REV results for the selected genes, restricted to FDR <= 0.05 and
# labelled by chromosome 8 region (assigned by gene start).
plot_tmp <- contr.rev$top_tags$table %>%
  dplyr::select(
    gene_name, logFC, logCPM, FDR,
    gene_start, gene_end, gene_chr, gene_biotype
  ) %>%
  mutate(
    region = case_when(
      gene_chr == "8" & gene_start >= 0 & gene_start < deletion_end ~
        "deletion",
      gene_chr == "8" & gene_start >= duplication_start & gene_start < duplication_end ~
        "duplication",
      TRUE ~ "other"
    )
  ) %>%
  filter(FDR <= 0.05 & gene_name %in% plot.genelist)

# One bar per gene showing its logFC, coloured by region, with the rounded
# value printed above the bar.
gene_dodge <- position_dodge(width = 0.9)
ggplot(plot_tmp, aes(x = gene_name, y = logFC, fill = region)) +
  geom_col(position = gene_dodge) +
  geom_text(
    aes(label = round(logFC, 2)),
    position = gene_dodge,
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Differential Expression by Region",
    x = "Gene",
    y = "logFC"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Figure 4E

pathways <- c("REACTOME_DNA_METHYLATION", "REACTOME_ACTIVATION_OF_ANTERIOR_HOX_GENES_IN_HINDBRAIN_DEVELOPMENT_DURING_EARLY_EMBRYOGENESIS", "REACTOME_SEMA4D_INDUCED_CELL_MIGRATION_AND_GROWTH_CONE_COLLAPSE", "REACTOME_TRANSLATION")

# GSEA results in lookup order: a pathway is plotted from the first collection
# whose result table contains its ID (Hallmark, then C2, then C5).
gsea_collections <- list(H = GSEA_H, C2 = GSEA_C2, C5 = GSEA_C5)

# Draw one enrichment plot per requested pathway, then print its ID.
for (id in pathways) {
  # Name of the first collection containing this pathway, or NULL if none does.
  hit <- NULL
  for (collection in names(gsea_collections)) {
    if (id %in% gsea_collections[[collection]]@result$ID) {
      hit <- collection
      break
    }
  }

  if (is.null(hit)) {
    print(paste("Pathway", id, "not found in any GSEA results."))
    next
  }

  # Named `enrichment_plot` rather than `plot` to avoid masking base::plot.
  enrichment_plot <- gseaplot2(gsea_collections[[hit]], geneSetID = id)
  print(enrichment_plot)
  print(id)
  # ggsave(filename = paste0("Enrichment_Plot_", id, "_", hit, ".png"), plot = enrichment_plot, width = 4, height = 4, dpi = 300)
}

## [1] "REACTOME_DNA_METHYLATION"

## [1] "REACTOME_ACTIVATION_OF_ANTERIOR_HOX_GENES_IN_HINDBRAIN_DEVELOPMENT_DURING_EARLY_EMBRYOGENESIS"

## [1] "REACTOME_SEMA4D_INDUCED_CELL_MIGRATION_AND_GROWTH_CONE_COLLAPSE"

## [1] "REACTOME_TRANSLATION"

Supplementary Tables

# Every PRO vs REV gene with FDR <= 0.05, labelled by chromosome 8 region
# (assigned by gene start).
DEgenesAll <- contr.rev$top_tags$table %>%
  filter(FDR <= 0.05) %>%
  mutate(
    region = case_when(
      gene_chr == "8" & gene_start >= 0 & gene_start < deletion_end ~
        "deletion",
      gene_chr == "8" & gene_start >= duplication_start & gene_start < duplication_end ~
        "duplication",
      TRUE ~ "other"
    )
  )

# Subset of the table above that also has an absolute logFC of at least 1.
DEgenes361 <- DEgenesAll %>%
  filter(abs(logFC) >= 1.0)
#write_csv(DEgenesAll, "DEgenesPROvsREV-all.csv")

# genesAll <- contr.rev$top_tags$table %>%
#   mutate(region = case_when(
#     gene_chr == "8" & gene_start >= 0 & gene_start < deletion_end ~ "deletion",
#     gene_chr == "8" & gene_start >= duplication_start & gene_start < duplication_end ~ "duplication",
#     TRUE ~ "other"
#   ))
#write_csv(genesAll, "genesPROvsREV-all.csv")
# Table S2
# Turn one GSEA result into a data frame, attach each gene set's description
# from the matching term-to-gene table, and tag the rows with a collection label.
describe_gsea <- function(gsea, t2g, geneset_label) {
  set_descriptions <- distinct(dplyr::select(t2g, gs_name, gs_description))
  as.data.frame(gsea@result) %>%
    left_join(set_descriptions, by = c("ID" = "gs_name")) %>%
    mutate(geneset = geneset_label)
}

gsea_h_df <- describe_gsea(GSEA_H, H_t2g, "Hallmark")
gsea_c5_df <- describe_gsea(GSEA_C5, C5_t2g, "C5")
gsea_c2_df <- describe_gsea(GSEA_C2, C2_t2g, "C2")

# Stack the three collections (Hallmark, C2, C5) and sort by enrichment score,
# lowest first.
GSEA_result_PRO_versus_REV <- bind_rows(gsea_h_df, gsea_c2_df, gsea_c5_df) %>%
  arrange(enrichmentScore)

# Export table: both description columns are dropped and the collection
# column is renamed.
GSEA_table <- GSEA_result_PRO_versus_REV %>%
  dplyr::select(-c(Description, gs_description)) %>%
  dplyr::rename("MSigDB_gene_sets" = geneset)
# write_csv(GSEA_table, "gseaPROvsREV.csv")

Session Info and Citations

# Record the R version, platform and attached/loaded package versions.
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_5.1.0         MetBrewer_0.2.0        biomaRt_2.60.1        
##  [4] msigdbr_7.5.1          enrichplot_1.24.4      org.Hs.eg.db_3.19.1   
##  [7] AnnotationDbi_1.66.0   IRanges_2.38.1         S4Vectors_0.42.1      
## [10] Biobase_2.64.0         BiocGenerics_0.50.0    clusterProfiler_4.12.6
## [13] edgeR_4.2.2            limma_3.60.6           lubridate_1.9.3       
## [16] forcats_1.0.0          stringr_1.5.1          dplyr_1.1.4           
## [19] purrr_1.0.2            readr_2.1.5            tidyr_1.3.1           
## [22] tibble_3.2.1           ggplot2_3.5.1          tidyverse_2.0.0       
## [25] readxl_1.4.3          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      rstudioapi_0.17.1       jsonlite_1.8.9         
##   [4] magrittr_2.0.3          farver_2.1.2            rmarkdown_2.28         
##   [7] fs_1.6.5                zlibbioc_1.50.0         vctrs_0.6.5            
##  [10] memoise_2.0.1           ggtree_3.12.0           progress_1.2.3         
##  [13] htmltools_0.5.8.1       curl_5.2.3              broom_1.0.7            
##  [16] cellranger_1.1.0        gridGraphics_0.5-1      sass_0.4.9             
##  [19] bslib_0.8.0             plyr_1.8.9              httr2_1.0.5            
##  [22] cachem_1.1.0            igraph_2.1.1            lifecycle_1.0.4        
##  [25] pkgconfig_2.0.3         Matrix_1.7-1            R6_2.5.1               
##  [28] fastmap_1.2.0           gson_0.1.0              GenomeInfoDbData_1.2.12
##  [31] digest_0.6.37           aplot_0.2.3             colorspace_2.1-1       
##  [34] patchwork_1.3.0         RSQLite_2.3.7           labeling_0.4.3         
##  [37] filelock_1.0.3          fansi_1.0.6             timechange_0.3.0       
##  [40] httr_1.4.7              polyclip_1.10-7         compiler_4.4.1         
##  [43] bit64_4.5.2             withr_3.0.2             backports_1.5.0        
##  [46] BiocParallel_1.38.0     viridis_0.6.5           DBI_1.2.3              
##  [49] highr_0.11              ggforce_0.4.2           R.utils_2.12.3         
##  [52] MASS_7.3-61             rappdirs_0.3.3          tools_4.4.1            
##  [55] ape_5.8                 scatterpie_0.2.4        R.oo_1.26.0            
##  [58] glue_1.8.0              nlme_3.1-166            GOSemSim_2.30.2        
##  [61] shadowtext_0.1.4        reshape2_1.4.4          fgsea_1.30.0           
##  [64] generics_0.1.3          gtable_0.3.6            tzdb_0.4.0             
##  [67] R.methodsS3_1.8.2       data.table_1.16.2       hms_1.1.3              
##  [70] xml2_1.3.6              tidygraph_1.3.1         utf8_1.2.4             
##  [73] XVector_0.44.0          ggrepel_0.9.6           pillar_1.9.0           
##  [76] babelgene_22.9          yulab.utils_0.1.7       splines_4.4.1          
##  [79] tweenr_2.0.3            BiocFileCache_2.12.0    treeio_1.28.0          
##  [82] lattice_0.22-6          bit_4.5.0               tidyselect_1.2.1       
##  [85] GO.db_3.19.1            locfit_1.5-9.10         Biostrings_2.72.1      
##  [88] knitr_1.48              gridExtra_2.3           xfun_0.48              
##  [91] graphlayouts_1.2.0      statmod_1.5.0           stringi_1.8.4          
##  [94] UCSC.utils_1.0.0        lazyeval_0.2.2          ggfun_0.1.7            
##  [97] yaml_2.3.10             evaluate_1.0.1          codetools_0.2-20       
## [100] ggraph_2.2.1            qvalue_2.36.0           ggplotify_0.1.2        
## [103] cli_3.6.3               munsell_0.5.1           jquerylib_0.1.4        
## [106] Rcpp_1.0.13             GenomeInfoDb_1.40.1     dbplyr_2.5.0           
## [109] png_0.1-8               parallel_4.4.1          blob_1.2.4             
## [112] prettyunits_1.2.0       DOSE_3.30.5             viridisLite_0.4.2      
## [115] tidytree_0.4.6          scales_1.3.0            crayon_1.5.3           
## [118] rlang_1.1.4             cowplot_1.1.3           fastmatch_1.1-4        
## [121] KEGGREST_1.44.1
# Names of the non-base packages attached in this session, and the citation
# entry for each of them (printed by the last line).
packages_in_use <- names(sessionInfo()$otherPkgs)
the_citations_list <- lapply(packages_in_use, citation)
the_citations_list
## [[1]]
## To cite package 'ggthemes' in publications use:
## 
##   Arnold J (2024). _ggthemes: Extra Themes, Scales and Geoms for
##   'ggplot2'_. R package version 5.1.0,
##   <https://CRAN.R-project.org/package=ggthemes>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {ggthemes: Extra Themes, Scales and Geoms for 'ggplot2'},
##     author = {Jeffrey B. Arnold},
##     year = {2024},
##     note = {R package version 5.1.0},
##     url = {https://CRAN.R-project.org/package=ggthemes},
##   }
## 
## [[2]]
## To cite package 'MetBrewer' in publications use:
## 
##   Mills BR (2022). _MetBrewer: Color Palettes Inspired by Works at the
##   Metropolitan Museum of Art_. R package version 0.2.0,
##   <https://CRAN.R-project.org/package=MetBrewer>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {MetBrewer: Color Palettes Inspired by Works at the Metropolitan Museum of
## Art},
##     author = {Blake Robert Mills},
##     year = {2022},
##     note = {R package version 0.2.0},
##     url = {https://CRAN.R-project.org/package=MetBrewer},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
## 
## [[3]]
## To cite the biomaRt package in publications use:
## 
##   Mapping identifiers for the integration of genomic datasets with the
##   R/Bioconductor package biomaRt. Steffen Durinck, Paul T. Spellman,
##   Ewan Birney and Wolfgang Huber, Nature Protocols 4, 1184-1191 (2009).
## 
##   BioMart and Bioconductor: a powerful link between biological
##   databases and microarray data analysis. Steffen Durinck, Yves Moreau,
##   Arek Kasprzyk, Sean Davis, Bart De Moor, Alvis Brazma and Wolfgang
##   Huber, Bioinformatics 21, 3439-3440 (2005).
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
## 
## [[4]]
## To cite package 'msigdbr' in publications use:
## 
##   Dolgalev I (2022). _msigdbr: MSigDB Gene Sets for Multiple Organisms
##   in a Tidy Data Format_. R package version 7.5.1,
##   <https://CRAN.R-project.org/package=msigdbr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {msigdbr: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format},
##     author = {Igor Dolgalev},
##     year = {2022},
##     note = {R package version 7.5.1},
##     url = {https://CRAN.R-project.org/package=msigdbr},
##   }
## 
## [[5]]
## To cite package 'enrichplot' in publications use:
## 
##   Yu G (2024). _enrichplot: Visualization of Functional Enrichment
##   Result_. doi:10.18129/B9.bioc.enrichplot
##   <https://doi.org/10.18129/B9.bioc.enrichplot>, R package version
##   1.24.4, <https://bioconductor.org/packages/enrichplot>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {enrichplot: Visualization of Functional Enrichment Result},
##     author = {Guangchuang Yu},
##     year = {2024},
##     note = {R package version 1.24.4},
##     url = {https://bioconductor.org/packages/enrichplot},
##     doi = {10.18129/B9.bioc.enrichplot},
##   }
## 
## [[6]]
## To cite package 'org.Hs.eg.db' in publications use:
## 
##   Carlson M (2024). _org.Hs.eg.db: Genome wide annotation for Human_. R
##   package version 3.19.1.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {org.Hs.eg.db: Genome wide annotation for Human},
##     author = {Marc Carlson},
##     year = {2024},
##     note = {R package version 3.19.1},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
## 
## [[7]]
## To cite package 'AnnotationDbi' in publications use:
## 
##   Pagès H, Carlson M, Falcon S, Li N (2024). _AnnotationDbi:
##   Manipulation of SQLite-based annotations in Bioconductor_.
##   doi:10.18129/B9.bioc.AnnotationDbi
##   <https://doi.org/10.18129/B9.bioc.AnnotationDbi>, R package version
##   1.66.0, <https://bioconductor.org/packages/AnnotationDbi>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {AnnotationDbi: Manipulation of SQLite-based annotations in Bioconductor},
##     author = {Hervé Pagès and Marc Carlson and Seth Falcon and Nianhua Li},
##     year = {2024},
##     note = {R package version 1.66.0},
##     url = {https://bioconductor.org/packages/AnnotationDbi},
##     doi = {10.18129/B9.bioc.AnnotationDbi},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
## 
## [[8]]
## To cite package 'IRanges' in publications use:
## 
##   Lawrence M, Huber W, Pag\`es H, Aboyoun P, Carlson M, et al. (2013)
##   Software for Computing and Annotating Genomic Ranges. PLoS Comput
##   Biol 9(8): e1003118. doi:10.1371/journal.pcbi.1003118
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Software for Computing and Annotating Genomic Ranges},
##     author = {Michael Lawrence and Wolfgang Huber and Herv\'e Pag\`es and Patrick Aboyoun and Marc Carlson and Robert Gentleman and Martin Morgan and Vincent Carey},
##     year = {2013},
##     journal = {{PLoS} Computational Biology},
##     volume = {9},
##     issue = {8},
##     doi = {10.1371/journal.pcbi.1003118},
##     url = {http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1003118},
##   }
## 
## [[9]]
## To cite package 'S4Vectors' in publications use:
## 
##   Pagès H, Lawrence M, Aboyoun P (2024). _S4Vectors: Foundation of
##   vector-like and list-like containers in Bioconductor_.
##   doi:10.18129/B9.bioc.S4Vectors
##   <https://doi.org/10.18129/B9.bioc.S4Vectors>, R package version
##   0.42.1, <https://bioconductor.org/packages/S4Vectors>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {S4Vectors: Foundation of vector-like and list-like containers in
## Bioconductor},
##     author = {Hervé Pagès and Michael Lawrence and Patrick Aboyoun},
##     year = {2024},
##     note = {R package version 0.42.1},
##     url = {https://bioconductor.org/packages/S4Vectors},
##     doi = {10.18129/B9.bioc.S4Vectors},
##   }
## 
## [[10]]
## To cite package 'Biobase' in publications use:
## 
##   Orchestrating high-throughput genomic analysis with Bioconductor. W.
##   Huber, V.J. Carey, R. Gentleman, ..., M. Morgan Nature Methods,
##   2015:12, 115.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     author = {W. Huber and V. J. Carey and R. Gentleman and S. Anders and M. Carlson and B. S. Carvalho and H. C. Bravo and S. Davis and L. Gatto and T. Girke and R. Gottardo and F. Hahne and K. D. Hansen and R. A. Irizarry and M. Lawrence and M. I. Love and J. MacDonald and V. Obenchain and A. K. {Ole's} and H. {Pag`es} and A. Reyes and P. Shannon and G. K. Smyth and D. Tenenbaum and L. Waldron and M. Morgan},
##     title = {{O}rchestrating high-throughput genomic analysis with {B}ioconductor},
##     journal = {Nature Methods},
##     year = {2015},
##     volume = {12},
##     number = {2},
##     pages = {115--121},
##     url = {http://www.nature.com/nmeth/journal/v12/n2/full/nmeth.3252.html},
##   }
## 
## [[11]]
## To cite package 'BiocGenerics' in publications use:
## 
##   Orchestrating high-throughput genomic analysis with Bioconductor. W.
##   Huber, V.J. Carey, R. Gentleman, ..., M. Morgan Nature Methods,
##   2015:12, 115.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     author = {{Huber} and {W.} and {Carey} and V. J. and {Gentleman} and {R.} and {Anders} and {S.} and {Carlson} and {M.} and {Carvalho} and B. S. and {Bravo} and H. C. and {Davis} and {S.} and {Gatto} and {L.} and {Girke} and {T.} and {Gottardo} and {R.} and {Hahne} and {F.} and {Hansen} and K. D. and {Irizarry} and R. A. and {Lawrence} and {M.} and {Love} and M. I. and {MacDonald} and {J.} and {Obenchain} and {V.} and {{Ole's}} and A. K. and {{Pag`es}} and {H.} and {Reyes} and {A.} and {Shannon} and {P.} and {Smyth} and G. K. and {Tenenbaum} and {D.} and {Waldron} and {L.} and {Morgan} and {M.}},
##     title = {{O}rchestrating high-throughput genomic analysis with {B}ioconductor},
##     journal = {Nature Methods},
##     year = {2015},
##     volume = {12},
##     number = {2},
##     pages = {115--121},
##     url = {http://www.nature.com/nmeth/journal/v12/n2/full/nmeth.3252.html},
##   }
## 
## [[12]]
## Please cite S. Xu (2024) for using clusterProfiler. In addition, please
## cite G. Yu (2010) when using GOSemSim, G. Yu (2015) when using DOSE and
## G. Yu (2015) when using ChIPseeker.
## 
##   S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R
##   Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
##   multiomics data. Nature Protocols. 2024,
##   doi:10.1038/s41596-024-01020-z
## 
##   T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L
##   Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
##   enrichment tool for interpreting omics data. The Innovation. 2021,
##   2(3):100141
## 
##   Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
##   clusterProfiler: an R package for comparing biological themes among
##   gene clusters. OMICS: A Journal of Integrative Biology 2012,
##   16(5):284-287
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
## 
## [[13]]
## See Section 1.2 in the User's Guide for more detail about how to cite
## the different edgeR pipelines.
## 
##   Chen Y, Chen L, Lun ATL, Baldoni PL, Smyth GK (2024). edgeR 4.0:
##   powerful differential analysis of sequencing data with expanded
##   functionality and improved support for small counts and larger
##   datasets. bioRxiv doi: 10.1101/2024.01.21.576131
## 
##   Chen Y, Lun ATL, Smyth GK (2016). From reads to genes to pathways:
##   differential expression analysis of RNA-Seq experiments using
##   Rsubread and the edgeR quasi-likelihood pipeline. F1000Research 5,
##   1438
## 
##   McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression
##   analysis of multifactor RNA-Seq experiments with respect to
##   biological variation. Nucleic Acids Research 40(10), 4288-4297
## 
##   Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
##   package for differential expression analysis of digital gene
##   expression data. Bioinformatics 26(1), 139-140
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
## 
## [[14]]
## To cite package 'limma' in publications use:
## 
##   Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and
##   Smyth, G.K. (2015). limma powers differential expression analyses for
##   RNA-sequencing and microarray studies. Nucleic Acids Research 43(7),
##   e47.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     author = {Matthew E Ritchie and Belinda Phipson and Di Wu and Yifang Hu and Charity W Law and Wei Shi and Gordon K Smyth},
##     title = {{limma} powers differential expression analyses for {RNA}-sequencing and microarray studies},
##     journal = {Nucleic Acids Research},
##     year = {2015},
##     volume = {43},
##     number = {7},
##     pages = {e47},
##     doi = {10.1093/nar/gkv007},
##   }
## 
## [[15]]
## To cite lubridate in publications use:
## 
##   Garrett Grolemund, Hadley Wickham (2011). Dates and Times Made Easy
##   with lubridate. Journal of Statistical Software, 40(3), 1-25. URL
##   https://www.jstatsoft.org/v40/i03/.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Dates and Times Made Easy with {lubridate}},
##     author = {Garrett Grolemund and Hadley Wickham},
##     journal = {Journal of Statistical Software},
##     year = {2011},
##     volume = {40},
##     number = {3},
##     pages = {1--25},
##     url = {https://www.jstatsoft.org/v40/i03/},
##   }
## 
## [[16]]
## To cite package 'forcats' in publications use:
## 
##   Wickham H (2023). _forcats: Tools for Working with Categorical
##   Variables (Factors)_. R package version 1.0.0,
##   <https://CRAN.R-project.org/package=forcats>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {forcats: Tools for Working with Categorical Variables (Factors)},
##     author = {Hadley Wickham},
##     year = {2023},
##     note = {R package version 1.0.0},
##     url = {https://CRAN.R-project.org/package=forcats},
##   }
## 
## [[17]]
## To cite package 'stringr' in publications use:
## 
##   Wickham H (2023). _stringr: Simple, Consistent Wrappers for Common
##   String Operations_. R package version 1.5.1,
##   <https://CRAN.R-project.org/package=stringr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {stringr: Simple, Consistent Wrappers for Common String Operations},
##     author = {Hadley Wickham},
##     year = {2023},
##     note = {R package version 1.5.1},
##     url = {https://CRAN.R-project.org/package=stringr},
##   }
## 
## [[18]]
## To cite package 'dplyr' in publications use:
## 
##   Wickham H, François R, Henry L, Müller K, Vaughan D (2023). _dplyr: A
##   Grammar of Data Manipulation_. R package version 1.1.4,
##   <https://CRAN.R-project.org/package=dplyr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {dplyr: A Grammar of Data Manipulation},
##     author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan},
##     year = {2023},
##     note = {R package version 1.1.4},
##     url = {https://CRAN.R-project.org/package=dplyr},
##   }
## 
## [[19]]
## To cite package 'purrr' in publications use:
## 
##   Wickham H, Henry L (2023). _purrr: Functional Programming Tools_. R
##   package version 1.0.2, <https://CRAN.R-project.org/package=purrr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {purrr: Functional Programming Tools},
##     author = {Hadley Wickham and Lionel Henry},
##     year = {2023},
##     note = {R package version 1.0.2},
##     url = {https://CRAN.R-project.org/package=purrr},
##   }
## 
## [[20]]
## To cite package 'readr' in publications use:
## 
##   Wickham H, Hester J, Bryan J (2024). _readr: Read Rectangular Text
##   Data_. R package version 2.1.5,
##   <https://CRAN.R-project.org/package=readr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {readr: Read Rectangular Text Data},
##     author = {Hadley Wickham and Jim Hester and Jennifer Bryan},
##     year = {2024},
##     note = {R package version 2.1.5},
##     url = {https://CRAN.R-project.org/package=readr},
##   }
## 
## [[21]]
## To cite package 'tidyr' in publications use:
## 
##   Wickham H, Vaughan D, Girlich M (2024). _tidyr: Tidy Messy Data_. R
##   package version 1.3.1, <https://CRAN.R-project.org/package=tidyr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {tidyr: Tidy Messy Data},
##     author = {Hadley Wickham and Davis Vaughan and Maximilian Girlich},
##     year = {2024},
##     note = {R package version 1.3.1},
##     url = {https://CRAN.R-project.org/package=tidyr},
##   }
## 
## [[22]]
## To cite package 'tibble' in publications use:
## 
##   Müller K, Wickham H (2023). _tibble: Simple Data Frames_. R package
##   version 3.2.1, <https://CRAN.R-project.org/package=tibble>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {tibble: Simple Data Frames},
##     author = {Kirill Müller and Hadley Wickham},
##     year = {2023},
##     note = {R package version 3.2.1},
##     url = {https://CRAN.R-project.org/package=tibble},
##   }
## 
## [[23]]
## To cite ggplot2 in publications, please use
## 
##   H. Wickham. ggplot2: Elegant Graphics for Data Analysis.
##   Springer-Verlag New York, 2016.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     author = {Hadley Wickham},
##     title = {ggplot2: Elegant Graphics for Data Analysis},
##     publisher = {Springer-Verlag New York},
##     year = {2016},
##     isbn = {978-3-319-24277-4},
##     url = {https://ggplot2.tidyverse.org},
##   }
## 
## [[24]]
## To cite package 'tidyverse' in publications use:
## 
##   Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R,
##   Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller
##   E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V,
##   Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to
##   the tidyverse." _Journal of Open Source Software_, *4*(43), 1686.
##   doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Welcome to the {tidyverse}},
##     author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
##     year = {2019},
##     journal = {Journal of Open Source Software},
##     volume = {4},
##     number = {43},
##     pages = {1686},
##     doi = {10.21105/joss.01686},
##   }
## 
## [[25]]
## To cite package 'readxl' in publications use:
## 
##   Wickham H, Bryan J (2023). _readxl: Read Excel Files_. R package
##   version 1.4.3, <https://CRAN.R-project.org/package=readxl>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {readxl: Read Excel Files},
##     author = {Hadley Wickham and Jennifer Bryan},
##     year = {2023},
##     note = {R package version 1.4.3},
##     url = {https://CRAN.R-project.org/package=readxl},
##   }