# Per-gene minimum 5' UTR start, needed to recode the coordinates.
# The summary is grouped by gene via `.by`, so each gene gets its own minimum
# (an ungrouped reframe() would repeat the table-wide minimum for every gene).
# A gene whose values are all NA gets NA instead of the Inf (plus a warning)
# that min(..., na.rm = TRUE) returns when nothing is left to compare.
min_5utr_start <- ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, `5_utr_start`) |>
  dplyr::summarise(
    min_5_utr_start = if (all(is.na(`5_utr_start`))) {
      NA_integer_
    } else {
      min(`5_utr_start`, na.rm = TRUE)
    },
    .by = ensembl_gene_id
  )
min_5utr_start
# Preview the two columns the per-gene summary is built from.
# dplyr:: prefixes are used because the library() calls come later in the file.
ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, `5_utr_start`)

# The table is already long (one gene id and one coordinate per row), so no
# pivot is needed to get a per-gene value. The reshape attempts that used to
# follow could not run: pivot_longer() was called without `cols`,
# pivot_short() is not a tidyr function, and pivot_wider() was called without
# names_from/values_from on a table that has no `name`/`value` columns.
# A grouped summary answers the question directly; rows with a missing start
# are dropped first so every remaining gene has at least one value.
ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, `5_utr_start`) |>
  dplyr::filter(!is.na(`5_utr_start`)) |>
  dplyr::summarise(
    min_value = min(`5_utr_start`),
    .by = ensembl_gene_id
  )
library(biomaRt)
library(tidyverse)
# Open a connection to the Ensembl BioMart, human gene dataset.
ensembl <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Attributes to retrieve: gene/transcript identifiers, chromosome, strand and
# the UTR/CDS start and end columns.
attributes <- c(
  "external_gene_name", "ensembl_gene_id", "ensembl_transcript_id",
  "chromosome_name", "strand",
  "5_utr_start", "5_utr_end",
  "3_utr_start", "3_utr_end",
  "cds_start", "cds_end"
)

# Query protein-coding transcripts only, then keep the rows located on
# chromosomes 1-22, X and Y.
ensembl_coordinates <- biomaRt::getBM(
  mart = ensembl,
  attributes = attributes,
  filters = "transcript_biotype",
  values = "protein_coding"
) |>
  filter(chromosome_name %in% c(as.character(1:22), "X", "Y"))

# Print the filtered table.
ensembl_coordinates
# Preview the full table as a tibble, then only the two columns used for the
# per-gene 5' UTR summary.
# The pivot_wider(ensembl_gene_id) call that used to open this section was
# dropped: it supplied no names_from/values_from and the table has no
# `name`/`value` columns for the defaults to pick up, so it could not run.
# The duplicated select()/dplyr::select() previews are collapsed into one.
ensembl_coordinates |>
  dplyr::as_tibble()
ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, `5_utr_start`)
# Per-gene minimum of the 5' UTR start.
# The column name starts with a digit, so it must be wrapped in backticks;
# the first attempt here used it bare, which does not parse, and has been
# folded into the version below.

# Without na.rm: a gene with any missing value yields NA.
ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, `5_utr_start`) |>
  group_by(ensembl_gene_id) |>
  summarize(min_value = min(`5_utr_start`), .groups = "drop")

# With na.rm = TRUE: missing values are ignored within each gene.
ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, `5_utr_start`) |>
  group_by(ensembl_gene_id) |>
  summarize(min_value = min(`5_utr_start`, na.rm = TRUE), .groups = "drop")
# Per-gene minimum of the 5' UTR start and end, needed to recode the
# coordinates.
# A gene whose values are all NA gets NA rather than the Inf (plus a warning)
# that min(..., na.rm = TRUE) returns when every value has been removed.
min_5utr_start <- ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, `5_utr_start`) |>
  group_by(ensembl_gene_id) |>
  summarize(
    min_5utr_start = if (all(is.na(`5_utr_start`))) {
      NA_integer_
    } else {
      min(`5_utr_start`, na.rm = TRUE)
    },
    .groups = "drop"
  )

min_5utr_end <- ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, `5_utr_end`) |>
  group_by(ensembl_gene_id) |>
  summarize(
    min_5utr_end = if (all(is.na(`5_utr_end`))) {
      NA_integer_
    } else {
      min(`5_utr_end`, na.rm = TRUE)
    },
    .groups = "drop"
  )

# Print the per-gene result and the source table for comparison.
min_5utr_end
ensembl_coordinates
# Per-gene maximum of cds_start.
# The value is named max_CDS_start, so it is computed with max(); the previous
# version called min() under that name. A gene whose values are all NA gets NA
# rather than the -Inf (plus a warning) max(..., na.rm = TRUE) would return.
# NOTE(review): later sections of this file switch to genomic_coding_start;
# confirm cds_start is the coordinate wanted here.
max_CDS_start <- ensembl_coordinates |>
  dplyr::as_tibble() |>
  dplyr::select(ensembl_gene_id, cds_start) |>
  group_by(ensembl_gene_id) |>
  summarize(
    max_CDS_start = if (all(is.na(cds_start))) {
      NA_integer_
    } else {
      max(cds_start, na.rm = TRUE)
    },
    .groups = "drop"
  )

# Print the per-gene result and the source table for comparison.
max_CDS_start
ensembl_coordinates
# Show the attribute pages of the connected mart. attributePages() needs the
# mart object; the argument-less call that preceded this one could not run.
attributePages(ensembl)
# Inspect the R-level attributes of the Mart object itself. This resolves to
# base::attributes(); the character vector also named `attributes` in this
# script is not a function and is skipped when looking up the call.
attributes(ensembl)
# Build the CDS rows: add the constant GTF fields, recode the strand and
# assemble the attribute string, then rename/reorder into the nine GTF columns.
gtf_data_cds <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "CDS",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = cds_start, end = cds_end,
    score, strand, frame, attribute
  )

# Show the first rows of the formatted table.
head(gtf_data_cds)
# Build the CDS rows in GTF column order.
gtf_data_cds <- ensembl_coordinates |>
  mutate(
    seqname = chromosome_name,
    source = "Ensembl",
    feature = "CDS",
    start = cds_start,
    end = cds_end,
    score = ".",
    # biomaRt strand is 1 / -1; GTF uses "+" / "-".
    strand = ifelse(strand == 1, "+", "-"),
    frame = ".",
    # Both values must be closed with `"; `. The previous version left the
    # gene_name value without its closing quote, giving an unbalanced
    # attribute string.
    attribute = paste0(
      'gene_id "', ensembl_gene_id, '"; ',
      'gene_name "', external_gene_name, '"; '
    )
  ) |>
  select(seqname, source, feature, start, end, score, strand, frame, attribute)

# Show the first rows of the formatted table.
head(gtf_data_cds)
# Build the CDS rows: add the constant GTF fields, recode the strand and
# assemble the attribute string, then rename/reorder into the nine GTF columns.
gtf_data_cds <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "CDS",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = cds_start, end = cds_end,
    score, strand, frame, attribute
  )

# Inspect the result: first rows, the whole table, first rows again.
head(gtf_data_cds)
gtf_data_cds
head(gtf_data_cds)

# List the attributes the mart offers.
listAttributes(ensembl)
# Get ENSEMBL info, this time requesting genomic_coding_start/end for the
# coding coordinates instead of cds_start/cds_end.
attributes <- c(
  "external_gene_name",
  "ensembl_gene_id",
  "ensembl_transcript_id",
  "chromosome_name",
  "strand",
  "5_utr_start",
  "5_utr_end",
  "3_utr_start",
  "3_utr_end",
  "genomic_coding_start",
  "genomic_coding_end"
)

# Query protein-coding transcripts and keep only chromosomes 1-22, X and Y.
# The chromosome filter is re-applied here because this assignment replaces
# the previously filtered ensembl_coordinates; the other fetch steps in this
# script apply the same restriction.
ensembl_coordinates <- biomaRt::getBM(
  attributes = attributes,
  filters = c("transcript_biotype"),
  values = c("protein_coding"),
  mart = ensembl
) |>
  filter(chromosome_name %in% c(1:22, "X", "Y"))
# Build the CDS rows from the genomic coding coordinates: add the constant GTF
# fields, recode the strand, assemble the attribute string, then rename and
# reorder into the nine GTF columns.
gtf_data_cds <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "CDS",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = genomic_coding_start, end = genomic_coding_end,
    score, strand, frame, attribute
  )

# Show the first rows of the formatted table.
head(gtf_data_cds)
# Build the 5' UTR rows: same layout as the CDS table, with feature "UTR5" and
# the 5_utr_start/5_utr_end columns as coordinates.
gtf_data_utr5 <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "UTR5",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = `5_utr_start`, end = `5_utr_end`,
    score, strand, frame, attribute
  )

# Print the table.
gtf_data_utr5
# Fetch the requested attributes for every protein-coding transcript and keep
# the unfiltered result under its own name.
ensembl_coordinates_all <- biomaRt::getBM(
  mart = ensembl,
  attributes = attributes,
  filters = "transcript_biotype",
  values = "protein_coding"
)

# Working copy restricted to chromosomes 1-22, X and Y.
ensembl_coordinates <- filter(
  ensembl_coordinates_all,
  chromosome_name %in% c(as.character(1:22), "X", "Y")
)
# Build the CDS rows from the genomic coding coordinates: add the constant GTF
# fields, recode the strand, assemble the attribute string, then rename and
# reorder into the nine GTF columns.
gtf_data_cds <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "CDS",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = genomic_coding_start, end = genomic_coding_end,
    score, strand, frame, attribute
  )

# Same layout for the 5' UTR rows.
gtf_data_utr5 <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "UTR5",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = `5_utr_start`, end = `5_utr_end`,
    score, strand, frame, attribute
  )

# Show the first rows of the CDS table.
head(gtf_data_cds)
# Build the 3' UTR rows: same layout as the other feature tables, with feature
# "UTR3" and the 3_utr_start/3_utr_end columns as coordinates.
gtf_data_utr3 <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "UTR3",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = `3_utr_start`, end = `3_utr_end`,
    score, strand, frame, attribute
  )

# Print the three feature tables stacked in CDS, UTR5, UTR3 order.
rbind(gtf_data_cds, gtf_data_utr5, gtf_data_utr3)
# Save the stacked CDS/UTR5/UTR3 rows to a file.
# The file name is a single string literal; the previous literal contained a
# stray `"csv"` fragment and did not parse. The output is written as
# tab-separated text without quotes, row names or a header line, because the
# target is a .gtf file rather than a CSV.
write.table(
  rbind(gtf_data_cds, gtf_data_utr5, gtf_data_utr3),
  "UTR_CDS_coordinates.gtf",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
# Build the three feature tables. Each one adds the constant GTF fields,
# recodes the strand, assembles the attribute string, then renames/reorders
# into the nine GTF columns; only the feature label and the coordinate columns
# differ.

# CDS rows, from the genomic coding coordinates.
gtf_data_cds <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "CDS",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = genomic_coding_start, end = genomic_coding_end,
    score, strand, frame, attribute
  )

# 5' UTR rows.
gtf_data_utr5 <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "UTR5",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = `5_utr_start`, end = `5_utr_end`,
    score, strand, frame, attribute
  )

# 3' UTR rows.
gtf_data_utr3 <- ensembl_coordinates |>
  mutate(
    source = "Ensembl",
    feature = "UTR3",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = `3_utr_start`, end = `3_utr_end`,
    score, strand, frame, attribute
  )
# Save the stacked CDS/UTR5/UTR3 rows to a file.
# One write replaces the four successive attempts, which all targeted the same
# path so only the last one determined the file's content. write.csv() is not
# used: it produces comma-separated output and ignores `sep` with a warning.
# The header line is suppressed as well, so the file holds only the
# tab-separated feature rows.
write.table(
  rbind(gtf_data_cds, gtf_data_utr5, gtf_data_utr3),
  "UTR_CDS_coordinates.gtf",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
# Build the CDS rows. Rows lacking a genomic coding start or end are removed
# first; the remaining rows get the constant GTF fields, the recoded strand
# and the attribute string, and are then renamed/reordered into the nine GTF
# columns.
gtf_data_cds <- ensembl_coordinates |>
  filter(!is.na(genomic_coding_start), !is.na(genomic_coding_end)) |>
  mutate(
    source = "Ensembl",
    feature = "CDS",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = genomic_coding_start, end = genomic_coding_end,
    score, strand, frame, attribute
  )
# Build the three feature tables. Each one first drops rows whose start or end
# coordinate is missing, then adds the constant GTF fields, recodes the strand,
# assembles the attribute string and renames/reorders into the nine GTF
# columns; only the feature label and the coordinate columns differ.

# CDS rows, from the genomic coding coordinates.
gtf_data_cds <- ensembl_coordinates |>
  filter(!is.na(genomic_coding_start), !is.na(genomic_coding_end)) |>
  mutate(
    source = "Ensembl",
    feature = "CDS",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = genomic_coding_start, end = genomic_coding_end,
    score, strand, frame, attribute
  )

# 5' UTR rows.
gtf_data_utr5 <- ensembl_coordinates |>
  filter(!is.na(`5_utr_start`), !is.na(`5_utr_end`)) |>
  mutate(
    source = "Ensembl",
    feature = "UTR5",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = `5_utr_start`, end = `5_utr_end`,
    score, strand, frame, attribute
  )

# 3' UTR rows.
gtf_data_utr3 <- ensembl_coordinates |>
  filter(!is.na(`3_utr_start`), !is.na(`3_utr_end`)) |>
  mutate(
    source = "Ensembl",
    feature = "UTR3",
    score = ".",
    frame = ".",
    strand = ifelse(strand == 1, "+", "-"),
    attribute = sprintf(
      'gene_id "%s"; gene_name "%s"; ',
      ensembl_gene_id, external_gene_name
    )
  ) |>
  select(
    seqname = chromosome_name, source, feature,
    start = `3_utr_start`, end = `3_utr_end`,
    score, strand, frame, attribute
  )
# Save the stacked CDS/UTR5/UTR3 rows, once as plain text and once
# gzip-compressed.
# Both files are tab-separated without quotes, row names or a header line.
# The header is suppressed with col.names = FALSE; write.table() has no
# `header` argument. The attempt with a doubled closing quote after the .gz
# file name did not parse and is folded into the second call.
write.table(
  rbind(gtf_data_cds, gtf_data_utr5, gtf_data_utr3),
  "UTR_CDS_coordinates.gtf",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
write.table(
  rbind(gtf_data_cds, gtf_data_utr5, gtf_data_utr3),
  gzfile("UTR_CDS_coordinates.gtf.gz"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
library(biomaRt)
library(tidyverse)
# Open a connection to the Ensembl BioMart, human gene dataset.
ensembl <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Attributes to retrieve: gene/transcript identifiers, chromosome, strand, the
# UTR start/end columns and the genomic coding start/end columns.
attributes <- c(
  "external_gene_name", "ensembl_gene_id", "ensembl_transcript_id",
  "chromosome_name", "strand",
  "5_utr_start", "5_utr_end",
  "3_utr_start", "3_utr_end",
  "genomic_coding_start", "genomic_coding_end"
)

# Fetch every protein-coding transcript and keep the unfiltered result under
# its own name.
ensembl_coordinates_all <- biomaRt::getBM(
  mart = ensembl,
  attributes = attributes,
  filters = "transcript_biotype",
  values = "protein_coding"
)

# Working copy restricted to chromosomes 1-22, X and Y.
ensembl_coordinates <- filter(
  ensembl_coordinates_all,
  chromosome_name %in% c(as.character(1:22), "X", "Y")
)
# Build the GTF-style rows for one feature type.
#
# coordinates:  data frame holding chromosome_name, strand, ensembl_gene_id,
#               external_gene_name and the two coordinate columns named below.
# feature_type: label written to the `feature` column.
# start_col, end_col: names (as strings) of the columns holding the feature's
#               start and end. Rows where either one is NA are dropped.
#
# Returns a data frame with the columns seqname, source, feature, start, end,
# score, strand, frame and attribute, in that order.
build_gtf_rows <- function(coordinates, feature_type, start_col, end_col) {
  coordinates |>
    filter(!is.na(.data[[start_col]]), !is.na(.data[[end_col]])) |>
    mutate(
      seqname = chromosome_name,
      source = "Ensembl",
      # .env$ makes sure the function argument is used, not a data column.
      feature = .env$feature_type,
      start = .data[[start_col]],
      end = .data[[end_col]],
      score = ".",
      # biomaRt strand is 1 / -1; the output uses "+" / "-".
      strand = ifelse(strand == 1, "+", "-"),
      frame = ".",
      attribute = paste0(
        'gene_id "', ensembl_gene_id, '"; ',
        'gene_name "', external_gene_name, '"; '
      )
    ) |>
    select(seqname, source, feature, start, end, score, strand, frame, attribute)
}

# One table per feature type; the three previously copy-pasted pipelines
# differed only in the feature label and the coordinate columns.
gtf_data_cds <- build_gtf_rows(
  ensembl_coordinates, "CDS", "genomic_coding_start", "genomic_coding_end"
)
gtf_data_utr5 <- build_gtf_rows(
  ensembl_coordinates, "UTR5", "5_utr_start", "5_utr_end"
)
gtf_data_utr3 <- build_gtf_rows(
  ensembl_coordinates, "UTR3", "3_utr_start", "3_utr_end"
)
# Save the stacked CDS/UTR5/UTR3 rows to a gzip-compressed file:
# tab-separated, without quotes, row names or a header line.
# The header is suppressed with col.names = FALSE; write.table() has no
# `header` argument, so the earlier call that passed one has been removed.
write.table(
  rbind(gtf_data_cds, gtf_data_utr5, gtf_data_utr3),
  gzfile("UTR_CDS_coordinates.gtf.gz"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)
