<!DOCTYPE html>

Orthogroup annotation
library("tidyverse")

Read orthogroups

orthogroups <- read_tsv("Orthogroups_Dec_20.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  .default = col_character()
)
See spec(...) for full column specifications.
colnames(orthogroups)[1] <- "orthogroup_id"
orthogroups

Read PANNZER annotations

descriptions_original <- read_tsv("~/research/orthogroup_parsing/pannzer_annotations/combined_descriptions.tsv", col_names = TRUE, skip_empty_rows = TRUE)
descriptions <- descriptions_original %>% select(qpid, desc, genename)
colnames(descriptions) <- c("gene_id", "pannzer_description", "pannzer_gene_name")
descriptions
go_terms <- read_tsv("~/research/orthogroup_parsing/pannzer_annotations/combined_GO_terms.tsv", col_names = TRUE, skip_empty_rows = TRUE)
go_MF <- go_terms[go_terms$ARGOT_rank == 1,] %>% filter(ontology == "MF") %>% select(qpid, desc)
colnames(go_MF) <-  c("gene_id", "molecular_function")
go_BP <-go_terms[go_terms$ARGOT_rank == 1,] %>% filter(ontology == "BP") %>% select(qpid, desc)
colnames(go_BP) <-  c("gene_id", "biological_processes")
go_CC <- go_terms[go_terms$ARGOT_rank == 1,] %>% filter(ontology == "CC") %>% select(qpid, desc)
colnames(go_CC) <-  c("gene_id", "cellular_component")
annotations <- left_join(descriptions, go_MF, by = "gene_id") %>% left_join(go_BP, by = "gene_id") %>% left_join(go_CC, by = "gene_id")
annotations
remove(go_terms)

Read diamond annotations

diamond_annotations <-read_tsv("all_best_hits.diamond.blastp", col_names = c("gene_id", "best_BLAST_hit", "eval"))
diamond_annotations

Read unassigned orthogroups

unassigned_orthogroups <- read_tsv("Orthogroups_UnassignedGenes_Dec_20.csv", col_types = cols(.default = "c"))
Missing column names filled in: 'X1' [1]
colnames(unassigned_orthogroups)[1] <- "orthogroup_id"
unassigned_orthogroups

Annotate all unassigned genes (all species)

species <- "acanthaster_planci"
unassigned_orthogroups[,species]
acanthaster_unassigned <- unassigned_orthogroups[!is.na(unassigned_orthogroups$acanthaster_planci), "acanthaster_planci"]
colnames(acanthaster_unassigned) <- "gene_id"
acanthaster_unassigned
acanthaster_unassigned <- left_join(acanthaster_unassigned, annotations, by = "gene_id") %>% left_join(diamond_annotations, by = "gene_id")

Calculate: Number of genes with no BLAST hit, genes with uncharacterized blast hit, and others.

nrow(acanthaster_unassigned)
[1] 5278
sum(!is.na(acanthaster_unassigned$best_BLAST_hit))
[1] 1944
acanthaster_unassigned[grep("uncharacterized", tolower(acanthaster_unassigned$best_BLAST_hit)), ]
#' Summarise the BLAST-hit status of unassigned genes for every species.
#'
#' For each species column of `unassigned` (every column except the first),
#' the non-missing gene ids are joined to the global `annotations` and
#' `diamond_annotations` tables and classified by `best_BLAST_hit`:
#' no hit (`NA`), uncharacterized (hit text contains "uncharacterized" or
#' "hypothetical", case-insensitive) or characterized (everything else).
#'
#' @param unassigned Tibble whose first column is skipped and whose remaining
#'   columns each hold the gene ids of one species (`NA` where absent).
#' @return A data.frame with one row per species: total unassigned genes, the
#'   three counts and the matching percentages rounded to one decimal.
#'   Percentages are `NaN` for a species without unassigned genes.
count_unassigned <- function(unassigned) {
  species_names <- colnames(unassigned)[-1]
  print(species_names)

  # Builds the one-row summary for a single species column.
  summarise_species <- function(species_name) {
    # `[[` yields a plain vector, so the NA mask is a vector as well.
    gene_ids <- unassigned[[species_name]]
    species_unassigned <- tibble(gene_id = gene_ids[!is.na(gene_ids)]) %>%
      left_join(annotations, by = "gene_id") %>%
      left_join(diamond_annotations, by = "gene_id")

    hits <- tolower(species_unassigned$best_BLAST_hit)
    unassigned_num <- nrow(species_unassigned)
    no_blast <- sum(is.na(hits))
    # A single pattern covers both keywords, so a hit that mentions both is
    # counted once instead of twice.
    uncharacterized <- sum(grepl("uncharacterized|hypothetical", hits))
    characterized <- unassigned_num - (no_blast + uncharacterized)

    percent <- function(count) round(100 * count / unassigned_num, digits = 1)

    data.frame(
      species = species_name,
      unassigned_genes = unassigned_num,
      no_blast_hit = no_blast,
      uncharacterized_blast_hit = uncharacterized,
      characterized_blast_hit = characterized,
      percent_no_blast_hit = percent(no_blast),
      percent_uncharacterized = percent(uncharacterized),
      percent_characterized = percent(characterized)
    )
  }

  per_species <- lapply(species_names, summarise_species)
  if (length(per_species) == 0) {
    return(data.frame())
  }
  # Bind once at the end rather than growing the result inside a loop.
  do.call(rbind, per_species)
}
unassigned_stats <- count_unassigned(unassigned_orthogroups)
 [1] "acanthaster_planci"            "acropora_digitifera"           "amphimedon_queenslandica"     
 [4] "aurelia_aurita_atlantic"       "aurelia_aurita_pacific"        "branchiostoma_floridae"       
 [7] "caenorhabditis_elegans"        "capitella_teleta"              "capsaspora_owczarzaki"        
[10] "ciona_intestinalis"            "clytia_hemisphaerica"          "creolimax_fragrantissima"     
[13] "danio_rerio"                   "daphnia_pulex"                 "dendronephthya_gigantea"      
[16] "drosophila_melanogaster"       "exaiptasia_pallida"            "gallus_gallus"                
[19] "helobdella_robusta"            "hofstenia_miamia"              "hoilungia_hongkongensis"      
[22] "homo_sapiens"                  "hydra_magnipapillata"          "hydractinia_echinata"         
[25] "hydractinia_symbiolongicarpus" "ixodes_scapularis"             "kudoa_iwatai"                 
[28] "lottia_gigantea"               "mnemiopsis_leidyi"             "monosiga_brevicolis"          
[31] "morbakka_virulenta"            "nematostella_vectensis"        "nemopilema_nomurai"           
[34] "notospermus_geniculatus"       "octopus_bimaculoides"          "orbicella_faveolata"          
[37] "phoronis_australis"            "pocillopora_damicornis"        "pristionchus_pacificus"       
[40] "renilla_muelleri"              "saccharomyces_cerevisiae"      "saccoglossus_kowalevskii"     
[43] "salpingoeca_rosetta"           "schistosoma_mansoni"           "schmidtea_mediterranea"       
[46] "sphaeroforma_arctica"          "tribolium_castaneum"           "trichoplax_adhaerens"         
[49] "xenopus_tropicalis"           
unassigned_stats
#write_tsv(unassigned_stats, path = "unassigned_stats.tsv")

Which species have >= 50% characterized unassigned genes?

Look at species with a lot of characterized blast hits

danio <- unassigned_orthogroups[!is.na(unassigned_orthogroups[,"danio_rerio"]), "danio_rerio"]
colnames(danio) <- "gene_id"
danio_annot <- left_join(danio, annotations, by = "gene_id") %>% left_join(diamond_annotations, by = "gene_id")

Output hydractinia unassigned genes

symbio_unassigned <- unassigned_orthogroups[!is.na(unassigned_orthogroups[,"hydractinia_symbiolongicarpus"]), "hydractinia_symbiolongicarpus"]
write_tsv(symbio_unassigned, "symbio_unassigned.txt")
symbio_unassigned
colnames(symbio_unassigned) <- "gene_id"
symbio_unassigned_annot <- left_join(symbio_unassigned, annotations, by = "gene_id") %>% left_join(diamond_annotations, by = "gene_id")
symbio_unassigned_annot
echinata_unassigned <- unassigned_orthogroups[!is.na(unassigned_orthogroups[,"hydractinia_echinata"]), "hydractinia_echinata"]
write_tsv(echinata_unassigned, "echinata_unassigned.txt")
echinata_unassigned

Assess transcriptome evidence for unassigned genes

unassigned_overlap_hsym <- read_tsv("symbio_overl_transcript")
Parsed with column specification:
cols(
  `#gene` = col_character(),
  gene_start = col_double(),
  gene_end = col_double(),
  overl_length = col_double(),
  `overl_%` = col_double(),
  transcript_start = col_double(),
  transcript_end = col_double(),
  transcript = col_character()
)
unassigned_overlap_hsym

Only keep gene/transcript overlap with the largest overlap for each gene

column_names <- colnames(unassigned_overlap_hsym)
column_names[1] <- c("gene_id")
colnames(unassigned_overlap_hsym) <- column_names
unassigned_longest_overlap_hsym <- unassigned_overlap_hsym %>% group_by(gene_id) %>% slice(which.max(`overl_%`))
mean(unassigned_longest_overlap_hsym$`overl_%`)
[1] 80.14421
Percentage of unassigned genes that overlap at 90% with transcript
100*nrow(unassigned_longest_overlap_hsym[unassigned_longest_overlap_hsym$`overl_%` >= 90, ])/nrow(unassigned_longest_overlap_hsym)
[1] 71.91781
Percentage of unassigned genes that overlap at 50% with transcript
100*nrow(unassigned_longest_overlap_hsym[unassigned_longest_overlap_hsym$`overl_%` >= 50, ])/nrow(unassigned_longest_overlap_hsym)
[1] 78.76712

Assess transcriptome evidence for all genes in genome (for comparison)

all_genes_overlap_hsym <- read_tsv("Hsym_overl_transcript")
Parsed with column specification:
cols(
  `#gene` = col_character(),
  gene_start = col_double(),
  gene_end = col_double(),
  overl_length = col_double(),
  `overl_%` = col_double(),
  transcript_start = col_double(),
  transcript_end = col_double(),
  transcript = col_character()
)
column_names <- colnames(all_genes_overlap_hsym)
column_names[1] <- c("gene_id")
column_names
[1] "gene_id"          "gene_start"       "gene_end"         "overl_length"     "overl_%"         
[6] "transcript_start" "transcript_end"   "transcript"      
colnames(all_genes_overlap_hsym) <- column_names
all_genes_overlap_hsym

Only keep gene/transcript overlap with the largest overlap for each gene

mean(all_genes_overlap_hsym$`overl_%`)
[1] 56.33483
100*nrow(all_genes_overlap_hsym[all_genes_overlap_hsym$`overl_%` >= 90, ])/nrow(all_genes_overlap_hsym)
[1] 49.98591

Same with Hydractinia echinata

unassigned_overlap_hech <- read_tsv("echinata_overl_transcript")
Parsed with column specification:
cols(
  `#gene` = col_character(),
  gene_start = col_double(),
  gene_end = col_double(),
  overl_length = col_double(),
  `overl_%` = col_double(),
  transcript_start = col_double(),
  transcript_end = col_double(),
  transcript = col_character()
)
unassigned_overlap_hech
column_names <- colnames(unassigned_overlap_hech)
column_names[1] <- c("gene_id")
colnames(unassigned_overlap_hech) <- column_names
unassigned_overlap_hech <- unassigned_overlap_hech %>% group_by(gene_id) %>% slice(which.max(`overl_%`))
mean(unassigned_overlap_hech$`overl_%`)
[1] 80.90112
100*nrow(unassigned_overlap_hech[unassigned_overlap_hech$`overl_%` >= 90, ])/nrow(unassigned_overlap_hech)
[1] 73.02417
100*nrow(unassigned_overlap_hech[unassigned_overlap_hech$`overl_%` >= 50, ])/nrow(unassigned_overlap_hech)
[1] 79.62116
all_genes_overlap_hech <- read_tsv("Hech_overl_transcript")
Parsed with column specification:
cols(
  `#gene` = col_character(),
  gene_start = col_double(),
  gene_end = col_double(),
  overl_length = col_double(),
  `overl_%` = col_double(),
  transcript_start = col_double(),
  transcript_end = col_double(),
  transcript = col_character()
)
all_genes_overlap_hech
column_names <- colnames(all_genes_overlap_hech)
column_names[1] <- c("gene_id")
colnames(all_genes_overlap_hech) <- column_names
all_genes_overlap_hech <- all_genes_overlap_hech %>% group_by(gene_id) %>% slice(which.max(`overl_%`))
mean(all_genes_overlap_hech$`overl_%`)
[1] 85.43459
100*nrow(all_genes_overlap_hech[all_genes_overlap_hech$`overl_%` >= 90, ])/nrow(all_genes_overlap_hech)
[1] 78.8996

Identify all genes with transcript evidence (for downstream analyses)

All genes with >= 90% overlap:

hsym_90_overlap <- all_genes_longest_overlap_hsym[all_genes_longest_overlap_hsym$`overl_%` >= 90, ]
hsym_90_overlap <- hsym_90_overlap %>% select(gene_id) %>% mutate(overlap_90 = "yes")
hsym_90_overlap$gene_id <- paste("Hsym|", hsym_90_overlap$gene_id, sep = "" )
hsym_90_overlap

All genes with >= 50% overlap:

hsym_50_overlap <- all_genes_longest_overlap_hsym[all_genes_longest_overlap_hsym$`overl_%` >= 50, ]
hsym_50_overlap <- hsym_50_overlap %>% select(gene_id) %>% mutate(overlap_50 = "yes")
hsym_50_overlap$gene_id <- paste("Hsym|", hsym_50_overlap$gene_id, sep = "" )
hsym_50_overlap
---
title: "Orthogroup annotation"
output: html_notebook
---

```{r}
library("tidyverse")
```

***

### Read orthogroups

```{r}
# Read the orthogroup table as tab-separated text. Every column is read as
# character explicitly (same as the unassigned-genes table) so the result does
# not depend on readr's type guessing.
orthogroups <- read_tsv(
  "Orthogroups_Dec_20.csv",
  col_types = cols(.default = "c")
)
# The first header field is empty in the file, so name that column here.
colnames(orthogroups)[1] <- "orthogroup_id"
orthogroups
```

***

### Read PANNZER annotations

```{r echo = T, results = 'hide'}
# PANNZER descriptions: keep the query id, description and gene name, renamed
# to the column names used throughout this notebook.
descriptions_original <- read_tsv(
  "~/research/orthogroup_parsing/pannzer_annotations/combined_descriptions.tsv",
  col_names = TRUE,
  skip_empty_rows = TRUE
)
descriptions <- select(
  descriptions_original,
  gene_id = qpid,
  pannzer_description = desc,
  pannzer_gene_name = genename
)
descriptions

# PANNZER GO terms: for each ontology (MF, BP, CC) keep only the rows with
# ARGOT_rank == 1 and expose the term description under an ontology-specific
# column name.
go_terms <- read_tsv(
  "~/research/orthogroup_parsing/pannzer_annotations/combined_GO_terms.tsv",
  col_names = TRUE,
  skip_empty_rows = TRUE
)
go_MF <- go_terms %>%
  filter(ARGOT_rank == 1, ontology == "MF") %>%
  select(gene_id = qpid, molecular_function = desc)
go_BP <- go_terms %>%
  filter(ARGOT_rank == 1, ontology == "BP") %>%
  select(gene_id = qpid, biological_processes = desc)
go_CC <- go_terms %>%
  filter(ARGOT_rank == 1, ontology == "CC") %>%
  select(gene_id = qpid, cellular_component = desc)

# One annotation table: descriptions plus the three GO columns.
annotations <- descriptions %>%
  left_join(go_MF, by = "gene_id") %>%
  left_join(go_BP, by = "gene_id") %>%
  left_join(go_CC, by = "gene_id")
annotations
rm(go_terms)
```

***

### Read diamond annotations

```{r echo = T, results = 'hide'}
# DIAMOND blastp results; the column names are supplied here rather than read
# from the file.
diamond_annotations <- read_tsv(
  "all_best_hits.diamond.blastp",
  col_names = c("gene_id", "best_BLAST_hit", "eval")
)
diamond_annotations
```

***

### Read unassigned orthogroups

```{r}
# Unassigned-genes table, with every column read as character.
unassigned_orthogroups <- read_tsv(
  "Orthogroups_UnassignedGenes_Dec_20.csv",
  col_types = cols(.default = col_character())
)
# Name the first column, whose header field is empty in the file.
names(unassigned_orthogroups)[1] <- "orthogroup_id"
unassigned_orthogroups
```

***

### Annotate all unassigned genes (all species)

```{r}
# Single-species example: acanthaster_planci.
species <- "acanthaster_planci"
unassigned_orthogroups[, species]

# Keep the rows where this species has a gene and expose them as `gene_id`.
acanthaster_unassigned <- unassigned_orthogroups %>%
  filter(!is.na(acanthaster_planci)) %>%
  select(gene_id = acanthaster_planci)
acanthaster_unassigned

# Attach the PANNZER annotations, then the DIAMOND hit.
acanthaster_unassigned <- acanthaster_unassigned %>%
  left_join(annotations, by = "gene_id") %>%
  left_join(diamond_annotations, by = "gene_id")
```

***

#### Calculate: Number of genes with no BLAST hit, genes with uncharacterized blast hit, and others.

```{r}
# Total rows, number of rows with a BLAST hit, and the rows whose hit text
# contains "uncharacterized" (case-insensitive).
nrow(acanthaster_unassigned)
length(na.omit(acanthaster_unassigned$best_BLAST_hit))
acanthaster_unassigned %>%
  filter(grepl("uncharacterized", tolower(best_BLAST_hit)))

```

```{r}
#' Summarise the BLAST-hit status of unassigned genes for every species.
#'
#' For each species column of `unassigned` (every column except the first),
#' the non-missing gene ids are joined to the global `annotations` and
#' `diamond_annotations` tables and classified by `best_BLAST_hit`:
#' no hit (`NA`), uncharacterized (hit text contains "uncharacterized" or
#' "hypothetical", case-insensitive) or characterized (everything else).
#'
#' @param unassigned Tibble whose first column is skipped and whose remaining
#'   columns each hold the gene ids of one species (`NA` where absent).
#' @return A data.frame with one row per species: total unassigned genes, the
#'   three counts and the matching percentages rounded to one decimal.
#'   Percentages are `NaN` for a species without unassigned genes.
count_unassigned <- function(unassigned) {
  species_names <- colnames(unassigned)[-1]
  print(species_names)

  # Builds the one-row summary for a single species column.
  summarise_species <- function(species_name) {
    # `[[` yields a plain vector, so the NA mask is a vector as well.
    gene_ids <- unassigned[[species_name]]
    species_unassigned <- tibble(gene_id = gene_ids[!is.na(gene_ids)]) %>%
      left_join(annotations, by = "gene_id") %>%
      left_join(diamond_annotations, by = "gene_id")

    hits <- tolower(species_unassigned$best_BLAST_hit)
    unassigned_num <- nrow(species_unassigned)
    no_blast <- sum(is.na(hits))
    # A single pattern covers both keywords, so a hit that mentions both is
    # counted once instead of twice.
    uncharacterized <- sum(grepl("uncharacterized|hypothetical", hits))
    characterized <- unassigned_num - (no_blast + uncharacterized)

    percent <- function(count) round(100 * count / unassigned_num, digits = 1)

    data.frame(
      species = species_name,
      unassigned_genes = unassigned_num,
      no_blast_hit = no_blast,
      uncharacterized_blast_hit = uncharacterized,
      characterized_blast_hit = characterized,
      percent_no_blast_hit = percent(no_blast),
      percent_uncharacterized = percent(uncharacterized),
      percent_characterized = percent(characterized)
    )
  }

  per_species <- lapply(species_names, summarise_species)
  if (length(per_species) == 0) {
    return(data.frame())
  }
  # Bind once at the end rather than growing the result inside a loop.
  do.call(rbind, per_species)
}
```

```{r}
# Build the per-species summary table, display it, and save it as TSV.
unassigned_stats <- count_unassigned(unassigned_orthogroups)
unassigned_stats
# The output path is passed positionally: readr renamed this argument from
# `path` to `file`, so the positional form works with either name and matches
# the other write_tsv() calls in this notebook.
write_tsv(unassigned_stats, "unassigned_stats.tsv")
```

#### Which species have >= 50% characterized unassigned genes?

```{r}
# which() drops NA comparisons: percent_characterized is NaN for a species
# with no unassigned genes, and an NA in a logical row index would otherwise
# produce an all-NA row in the result.
unassigned_stats[which(unassigned_stats$percent_characterized >= 50), ]
```



### Look at species with a lot of characterized blast hits

```{r}
# Unassigned danio_rerio genes, exposed as `gene_id`.
danio <- unassigned_orthogroups %>%
  filter(!is.na(danio_rerio)) %>%
  select(gene_id = danio_rerio)
# Attach the PANNZER annotations, then the DIAMOND hit.
danio_annot <- danio %>%
  left_join(annotations, by = "gene_id") %>%
  left_join(diamond_annotations, by = "gene_id")

```

```{r}
danio_annot
```

#### Output hydractinia unassigned genes

```{r}
# Unassigned hydractinia_symbiolongicarpus genes, written out under the
# original species column name.
symbio_unassigned <- unassigned_orthogroups %>%
  filter(!is.na(hydractinia_symbiolongicarpus)) %>%
  select(hydractinia_symbiolongicarpus)
write_tsv(symbio_unassigned, "symbio_unassigned.txt")
symbio_unassigned
```


```{r}
# Rename the single column to the join key, then attach the PANNZER
# annotations and the DIAMOND hit.
names(symbio_unassigned) <- "gene_id"
symbio_unassigned_annot <- symbio_unassigned %>%
  left_join(annotations, by = "gene_id") %>%
  left_join(diamond_annotations, by = "gene_id")
symbio_unassigned_annot
```


```{r}
# Unassigned hydractinia_echinata genes, written out under the original
# species column name.
echinata_unassigned <- unassigned_orthogroups %>%
  filter(!is.na(hydractinia_echinata)) %>%
  select(hydractinia_echinata)
write_tsv(echinata_unassigned, "echinata_unassigned.txt")
echinata_unassigned
```

### Assess transcriptome evidence for unassigned genes

```{r}
# Gene/transcript overlap table. Column types are pinned to the specification
# readr previously guessed for this file, so parsing no longer depends on
# type guessing.
unassigned_overlap_hsym <- read_tsv(
  "symbio_overl_transcript",
  col_types = cols(
    `#gene` = col_character(),
    gene_start = col_double(),
    gene_end = col_double(),
    overl_length = col_double(),
    `overl_%` = col_double(),
    transcript_start = col_double(),
    transcript_end = col_double(),
    transcript = col_character()
  )
)
unassigned_overlap_hsym
```

#### Only keep gene/transcript overlap with the largest overlap for each gene

```{r}
# Rename the first column (`#gene` in the file) to `gene_id`.
column_names <- replace(colnames(unassigned_overlap_hsym), 1, "gene_id")
unassigned_overlap_hsym <- setNames(unassigned_overlap_hsym, column_names)
# For every gene keep only the row with the highest overlap percentage.
unassigned_longest_overlap_hsym <- slice(
  group_by(unassigned_overlap_hsym, gene_id),
  which.max(`overl_%`)
)
```

```{r}
unassigned_longest_overlap_hsym
```


```{r}
# Mean of the per-gene best overlap percentages.
mean(unassigned_longest_overlap_hsym[["overl_%"]])
```

##### Percentage of unassigned genes that overlap at 90% with transcript

```{r}
# Percentage of genes whose best overlap is >= 90%. local() keeps the helper
# name out of the global environment; the value is still printed.
local({
  best <- unassigned_longest_overlap_hsym
  100 * nrow(best[best[["overl_%"]] >= 90, ]) / nrow(best)
})
```

##### Percentage of unassigned genes that overlap at 50% with transcript

```{r}
# Percentage of genes whose best overlap is >= 50%. local() keeps the helper
# name out of the global environment; the value is still printed.
local({
  best <- unassigned_longest_overlap_hsym
  100 * nrow(best[best[["overl_%"]] >= 50, ]) / nrow(best)
})
```

### Assess transcriptome evidence for all genes in genome (for comparison)


```{r}
# Gene/transcript overlap table for all genes. Column types are pinned to the
# specification readr previously guessed for this file, so parsing no longer
# depends on type guessing.
all_genes_overlap_hsym <- read_tsv(
  "Hsym_overl_transcript",
  col_types = cols(
    `#gene` = col_character(),
    gene_start = col_double(),
    gene_end = col_double(),
    overl_length = col_double(),
    `overl_%` = col_double(),
    transcript_start = col_double(),
    transcript_end = col_double(),
    transcript = col_character()
  )
)
# Rename the first column (`#gene` in the file) to `gene_id`.
column_names <- colnames(all_genes_overlap_hsym)
column_names[1] <- "gene_id"
column_names
colnames(all_genes_overlap_hsym) <- column_names
all_genes_overlap_hsym
```


#### Only keep gene/transcript overlap with the largest overlap for each gene

```{r}
# Collapse to one row per gene: the overlap record with the highest `overl_%`.
all_genes_longest_overlap_hsym <- slice(
  group_by(all_genes_overlap_hsym, gene_id),
  which.max(`overl_%`)
)
# Display sorted by ascending overlap (the sorted table is not stored).
arrange(all_genes_longest_overlap_hsym, `overl_%`)
```


```{r}
# Use the per-gene best overlaps (one row per gene) rather than every overlap
# row, so this mean is comparable with the unassigned-gene mean, which is also
# computed on best overlaps.
mean(all_genes_longest_overlap_hsym$`overl_%`)
```

```{r}
# Percentage of all genes whose best overlap with a transcript is >= 90%.
100*nrow(all_genes_longest_overlap_hsym[all_genes_longest_overlap_hsym$`overl_%` >= 90, ])/nrow(all_genes_longest_overlap_hsym)
```


***

#### Same with Hydractinia echinata


```{r}
# Gene/transcript overlap table. Column types are pinned to the specification
# readr previously guessed for this file, so parsing no longer depends on
# type guessing.
unassigned_overlap_hech <- read_tsv(
  "echinata_overl_transcript",
  col_types = cols(
    `#gene` = col_character(),
    gene_start = col_double(),
    gene_end = col_double(),
    overl_length = col_double(),
    `overl_%` = col_double(),
    transcript_start = col_double(),
    transcript_end = col_double(),
    transcript = col_character()
  )
)
unassigned_overlap_hech
```




```{r}
# Rename the first column (`#gene` in the file) to `gene_id`.
column_names <- replace(colnames(unassigned_overlap_hech), 1, "gene_id")
unassigned_overlap_hech <- setNames(unassigned_overlap_hech, column_names)
# Overwrite the table with one row per gene: the record with the highest
# overlap percentage.
unassigned_overlap_hech <- slice(
  group_by(unassigned_overlap_hech, gene_id),
  which.max(`overl_%`)
)
```


```{r}
# Mean of the per-gene best overlap percentages.
mean(unassigned_overlap_hech[["overl_%"]])
```

```{r}
# Percentage of genes whose best overlap is >= 90%.
local({
  best <- unassigned_overlap_hech
  100 * nrow(best[best[["overl_%"]] >= 90, ]) / nrow(best)
})
```


```{r}
# Percentage of genes whose best overlap is >= 50%.
local({
  best <- unassigned_overlap_hech
  100 * nrow(best[best[["overl_%"]] >= 50, ]) / nrow(best)
})
```


```{r}
# Gene/transcript overlap table for all genes. Column types are pinned to the
# specification readr previously guessed for this file, so parsing no longer
# depends on type guessing.
all_genes_overlap_hech <- read_tsv(
  "Hech_overl_transcript",
  col_types = cols(
    `#gene` = col_character(),
    gene_start = col_double(),
    gene_end = col_double(),
    overl_length = col_double(),
    `overl_%` = col_double(),
    transcript_start = col_double(),
    transcript_end = col_double(),
    transcript = col_character()
  )
)
all_genes_overlap_hech
```


```{r}
# Rename the first column (`#gene` in the file) to `gene_id`.
column_names <- replace(colnames(all_genes_overlap_hech), 1, "gene_id")
all_genes_overlap_hech <- setNames(all_genes_overlap_hech, column_names)
# Overwrite the table with one row per gene: the record with the highest
# overlap percentage.
all_genes_overlap_hech <- slice(
  group_by(all_genes_overlap_hech, gene_id),
  which.max(`overl_%`)
)
```


```{r}
# Mean of the per-gene best overlap percentages.
mean(all_genes_overlap_hech[["overl_%"]])
```

```{r}
# Percentage of genes whose best overlap is >= 90%.
local({
  best <- all_genes_overlap_hech
  100 * nrow(best[best[["overl_%"]] >= 90, ]) / nrow(best)
})
```



#### Identify all genes with transcript evidence (for downstream analyses)

#### All genes with >= 90% overlap:

```{r}
# Genes whose best transcript overlap is >= 90%, with prefixed ids and a flag
# column. ungroup() drops the per-gene grouping inherited from
# all_genes_longest_overlap_hsym, so the grouping column is not rewritten
# underneath stale group metadata and the result is a plain tibble.
hsym_90_overlap <- all_genes_longest_overlap_hsym %>%
  ungroup() %>%
  filter(`overl_%` >= 90) %>%
  transmute(gene_id = paste0("Hsym|", gene_id), overlap_90 = "yes")
hsym_90_overlap
```

#### All genes with >= 50% overlap

```{r}
hsym_50_overlap <- all_genes_longest_overlap_hsym[all_genes_longest_overlap_hsym$`overl_%` >= 50, ]
hsym_50_overlap <- hsym_50_overlap %>% select(gene_id) %>% mutate(overlap_50 = "yes")
hsym_50_overlap$gene_id <- paste("Hsym|", hsym_50_overlap$gene_id, sep = "" )
hsym_50_overlap
```











