<!DOCTYPE html>
library(tidyverse)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.2.1     ✓ purrr   0.3.3
✓ tibble  2.1.3     ✓ dplyr   0.8.3
✓ tidyr   1.0.0     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.4.0
── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(viridis)
Loading required package: viridisLite
# Load inputs ----
# Orthogroup gene counts, read with read.table() defaults.
ortho_counts <- read.table("Orthogroups.GeneCount_dec_20.csv")
# Keep the first 49 columns only, which removes the Totals column.
ortho_binary <- ortho_counts[, seq_len(49)]
# Collapse counts to presence/absence: every count of 1 or more becomes 1.
is_present <- ortho_binary >= 1
ortho_binary[is_present] <- 1
# Taxonomy table; the parsed columns are species, clade and phylum.
taxonomy_phyla <- read_csv("taxonomy_run3_phyla.csv")
Parsed with column specification:
cols(
  species = col_character(),
  clade = col_character(),
  phylum = col_character()
)
# Store phylum as a factor so that its levels can be enumerated later.
taxonomy_phyla[["phylum"]] <- factor(taxonomy_phyla[["phylum"]])
# The matching conversion for species is commented out:
#taxonomy_phyla$species <- factor(taxonomy_phyla$species)
taxonomy_phyla
# Phylum-specific gene counts ----
# For each phylum level, find the orthogroups that are present in at least one
# species of that phylum and absent from every other species, then sum each
# member species' gene counts over those orthogroups. The result has one row
# per species with the columns `species` and `phylum_specific_genes`.
phylum_specific_genes <- data.frame(
  species = character(0),
  phylum_specific_genes = double(0),
  stringsAsFactors = FALSE
)
phylum_levels <- levels(taxonomy_phyla$phylum)
# Collect one data frame per phylum and bind them once after the loop, rather
# than growing the result with rbind() on every iteration.
per_phylum <- vector("list", length(phylum_levels))
for (i in seq_along(phylum_levels)) {
  phylum <- phylum_levels[i]
  # which() drops NA comparisons, so rows with a missing phylum are skipped.
  species_in <- as.character(
    taxonomy_phyla$species[which(taxonomy_phyla$phylum == phylum)]
  )
  # Columns are picked by name with base indexing: `species_in` is an ordinary
  # character vector, so it can never be mistaken for a column of that name.
  other_names <- setdiff(colnames(ortho_binary), species_in)
  select_species <- ortho_binary[, species_in, drop = FALSE]
  other_species <- ortho_binary[, other_names, drop = FALSE]
  # Present in >= 1 species of this phylum and in no species outside it.
  is_specific <- rowSums(select_species) >= 1 & rowSums(other_species) == 0
  specific_ortho_at_least_one <- row.names(ortho_binary)[is_specific]
  # colSums() gives every species' total in one call; this replaces an inner
  # loop that reused the index `i` and iterated over 1:length(species_in).
  # unname() keeps the species names out of the data frame's row names.
  phylum_specific_counts <- unname(colSums(
    ortho_counts[specific_ortho_at_least_one, species_in, drop = FALSE]
  ))
  per_phylum[[i]] <- data.frame(
    species = species_in,
    phylum_specific_genes = phylum_specific_counts,
    stringsAsFactors = FALSE
  )
}
# The empty prototype goes first so the result is still a data frame with the
# right columns when there are no phylum levels at all.
phylum_specific_genes <- do.call(
  rbind,
  c(list(phylum_specific_genes), per_phylum)
)
phylum_specific_genes
# Per-species statistics ----
# Read the tab-separated statistics table and show it.
stats_per_species <- read_tsv("Statistics_PerSpecies_Dec_20.csv")
stats_per_species
# Columns 2 to 50 are used as the per-species columns; their headers become
# the species names.
species <- colnames(stats_per_species)[2:50]
# Each statistic is one row of the table: flatten the whole row, keep
# elements 2 to 50, then convert to numeric.
# NOTE(review): row 1 is taken as the total gene count and row 3 as the
# unassigned gene count, going by the variable names -- confirm against the file.
total_genes <- as.numeric(
  unlist(slice(stats_per_species, 1), use.names = FALSE)[2:50]
)
unassigned_genes <- as.numeric(
  unlist(slice(stats_per_species, 3), use.names = FALSE)[2:50]
)
unassigned_genes
[1] 5278 1201 16167 4339 2543 5260 7513 6581 2050 1654 4397 819 2527 5314 1383 2966 1124 815
[19] 6451 8696 538 2092 7236 2919 1246 6818 2414 5716 4571 1925 5987 4343 2014 8375 1744 1005
[37] 3093 550 12100 4972 2081 2012 2775 2040 8586 6924 1635 972 1277
# Row 5 of the statistics table, elements 2 to 50, as numeric.
# NOTE(review): treated as the percentage of unassigned genes, going by the
# variable name -- confirm against the file.
percent_unassigned <- as.numeric(
  unlist(slice(stats_per_species, 5), use.names = FALSE)[2:50]
)
# One row per species, combining the vectors extracted from the table.
unassigned <- tibble(
  species = species,
  total_genes = total_genes,
  unassigned_genes = unassigned_genes,
  percent_unassigned = percent_unassigned
)
unassigned
# Combine both tables ----
# Give `species` the same factor levels in both tables (the taxonomy order,
# reversed) so that the join key matches and the plot axis follows that order.
species_levels <- rev(taxonomy_phyla$species)
phylum_specific_genes$species <- factor(
  phylum_specific_genes$species,
  levels = species_levels
)
unassigned$species <- factor(unassigned$species, levels = species_levels)
# Keep every row of the phylum-specific table and attach the matching stats.
phylum_spec_and_unassigned <- phylum_specific_genes %>%
  left_join(unassigned, by = "species")
phylum_spec_and_unassigned
# Absolute counts per category ----
# `other_genes` is whatever remains of the total after removing the
# phylum-specific and unassigned genes. transmute() keeps only the listed
# columns, in this order.
df_new <- transmute(
  phylum_spec_and_unassigned,
  species,
  phylum_specific_genes,
  unassigned_genes,
  other_genes = total_genes - phylum_specific_genes - unassigned_genes
)
df_new
# Long format for plotting: one row per species and gene category.
# gather() is kept so the row order and the data frame class stay as they are.
df_long <- gather(
  df_new,
  key = "gene_category",
  value = "gene_count",
  phylum_specific_genes, unassigned_genes, other_genes
)
# Fix the category order explicitly; it determines stacking and legend order.
df_long$gene_category <- factor(
  df_long$gene_category,
  levels = c("other_genes", "phylum_specific_genes", "unassigned_genes")
)
df_long
# Stacked bars of gene counts per species, drawn horizontally.
# geom_col() plots the y values as given.
gene_count_plot <- ggplot(
  df_long,
  aes(x = species, y = gene_count, fill = gene_category)
) +
  geom_col() +
  coord_flip()
# Display names for the axis: underscores become spaces and the first
# character is upper-cased.
species_names <- gsub("_", " ", taxonomy_phyla$species, fixed = TRUE)
species_names <- paste0(
  toupper(substr(species_names, 1, 1)),
  substring(species_names, 2)
)
species_names
[1] "Danio rerio" "Xenopus tropicalis" "Gallus gallus"
[4] "Homo sapiens" "Ciona intestinalis" "Branchiostoma floridae"
[7] "Saccoglossus kowalevskii" "Acanthaster planci" "Pristionchus pacificus"
[10] "Caenorhabditis elegans" "Drosophila melanogaster" "Tribolium castaneum"
[13] "Daphnia pulex" "Ixodes scapularis" "Lottia gigantea"
[16] "Octopus bimaculoides" "Phoronis australis" "Notospermus geniculatus"
[19] "Schistosoma mansoni" "Schmidtea mediterranea" "Capitella teleta"
[22] "Helobdella robusta" "Exaiptasia pallida" "Nematostella vectensis"
[25] "Orbicella faveolata" "Acropora digitifera" "Pocillopora damicornis"
[28] "Renilla muelleri" "Dendronephthya gigantea" "Aurelia aurita atlantic"
[31] "Aurelia aurita pacific" "Nemopilema nomurai" "Hydractinia echinata"
[34] "Hydractinia symbiolongicarpus" "Hydra magnipapillata" "Clytia hemisphaerica"
[37] "Morbakka virulenta" "Kudoa iwatai" "Hofstenia miamia"
[40] "Hoilungia hongkongensis" "Trichoplax adhaerens" "Amphimedon queenslandica"
[43] "Mnemiopsis leidyi" "Capsaspora owczarzaki" "Creolimax fragrantissima"
[46] "Monosiga brevicolis" "Saccharomyces cerevisiae" "Salpingoeca rosetta"
[49] "Sphaeroforma arctica"
# Render the count plot with readable labels.
gene_count_plot +
  # Species axis: display names, reversed to match the reversed factor levels
  scale_x_discrete(name = "Species", labels = rev(species_names)) +
  # Viridis fill colours with the legend entries spelled out
  scale_fill_viridis(
    discrete = TRUE,
    option = "viridis",
    labels = c("Other", "Phylum-specific", "Unassigned")
  ) +
  # Reverse the legend so that "Other" is not listed first
  guides(fill = guide_legend(reverse = TRUE)) +
  ylab("Gene count") +
  # Colour the species names by phylum (ugly but useful)
  theme(
    axis.text.y = element_text(size = 7, color = rev(taxonomy_phyla$phylum))
  )
# Percentages per category ----
# Phylum-specific and other genes are expressed as a share of each species'
# total; `percent_unassigned` is carried over from the joined table unchanged.
df_percent <- transmute(
  phylum_spec_and_unassigned,
  species,
  percent_phylum_specific = (phylum_specific_genes / total_genes) * 100,
  percent_unassigned,
  percent_other_genes =
    (total_genes - phylum_specific_genes - unassigned_genes) * 100 / total_genes
)
df_percent
# Long format for plotting: one row per species and percentage category.
# gather() is kept so the row order and the data frame class stay as they are.
df_percent_long <- gather(
  df_percent,
  key = "gene_category",
  value = "percent_genes",
  percent_phylum_specific, percent_unassigned, percent_other_genes
)
# Fix the category order explicitly; it determines stacking and legend order.
df_percent_long$gene_category <- factor(
  df_percent_long$gene_category,
  levels = c("percent_other_genes", "percent_phylum_specific", "percent_unassigned")
)
df_percent_long
# Stacked bars of gene percentages per species, drawn horizontally.
# geom_col() plots the y values as given.
percent_plot <- ggplot(
  df_percent_long,
  aes(x = species, y = percent_genes, fill = gene_category)
) +
  geom_col() +
  coord_flip()
# Render the percentage plot with the same labelling as the count plot.
percent_plot +
  # Species axis: display names, reversed to match the reversed factor levels
  scale_x_discrete(name = "Species", labels = rev(species_names)) +
  # Viridis fill colours with the legend entries spelled out
  scale_fill_viridis(
    discrete = TRUE,
    option = "viridis",
    labels = c("Other", "Phylum-specific", "Unassigned")
  ) +
  # Reverse the legend so that "Other" is not listed first
  guides(fill = guide_legend(reverse = TRUE)) +
  ylab("%") +
  # Colour the species names by phylum
  theme(
    axis.text.y = element_text(size = 7, color = rev(taxonomy_phyla$phylum))
  )