packages <- c("R.utils", "RColorBrewer", "ape", "assertthat", "checkmate",
              "coRdon", "corrplot", "dendextend", "devtools", "doParallel", 
              "dplyr", "factoextra", "formatR", "futile.logger", "ggplot2", 
              "grid", "gtools", "kmed", "lazyeval", "magrittr", "parallel", 
              "pheatmap", "readr", "stringr", "tibble", "tictoc", "tidyr", "vegan")
for (pkg in packages) {
  library(pkg, character.only=TRUE)
}

# bioconductor packages
library(BiocManager)
library(Biostrings)
library(ComplexHeatmap)
library(coRdon)

# everything else
library(gRodon)
library(pairwiseAdonis)
library(EcolUtils)
library(ggtree)
library(microtrait)
genomeset_results <- readRDS("data/genomeset_results.rds")

It appears microTrait’s rules for GH traits are too broad. To investigate this, we first look at the proportion of MAGs identified as decomposers according to GH gene presence. In this model, if a MAG possesses genes in any of the detected GH families, it is considered a decomposer.

# separate hmm_matrix and colnames from genomeset_results 
hmm_matrix <- genomeset_results[["hmm_matrix"]]
# hmm_colnames <- colnames(genomeset_results[["hmm_matrix"]]) %>% as.data.frame()

# selecting GH genes -> 41 total
bw_gh <- as.data.frame(cbind(hmm_matrix$id, hmm_matrix[,1792:1832])) # gh genes: "GH#"
bw_gh[,2:42] <- lapply(bw_gh[,2:42], as.numeric) # convert to numeric
bw_gh[,2:42] <- bw_gh[,2:42] - 1 # fix offset (binary)
colnames(bw_gh)[1] = "id"

# exclude GH genes not detected -> 40 total
temp.1 <- as.data.frame(cbind((colSums(bw_gh[,2:42]))))
erase.1 <- temp.1 %>% filter(V1==0) # only GH7
erase.1$row_names <- row.names(erase.1)
bw_gh <- bw_gh[, !(names(bw_gh) %in% erase.1$row_names)]

In this case, all but one of the 817 MAGs contained GH genes, so it was manually isolated.

a <- as.data.frame(rowSums(bw_gh[2:41]))
a$id <- bw_gh$id
not_him <- a %>%
  filter(a[,1] == 0) # this is where we find the one non-decomposer

# simplified table
decom_simple <- bw_gh['id'] 
decom_simple$'decomposer?' <- 'yes'
decom_simple$'decomposer?'[decom_simple$id == '3300050821_379'] <- 'no'

# pie chart
decom_simple %>%
  count(`decomposer?`) %>%
  ggplot(aes(x = '', y = n, fill = `decomposer?`)) +
  geom_col(color = 'black') +
  coord_polar(theta = "y") +
  geom_text(aes(label = n),
            color = 'white',
            position = position_stack(vjust = 0.5)) +
  ggtitle("Proportion of \"decomposer\" MAGs") +
  xlab('') + 
  ylab('') +
  theme_void()

# ggsave("images/decom_ratio.png", plot=last_plot())

Upon lookup in the MetaBAT output, the taxonomic identity of MAG 3300050821_379 was found to be the following:

Domain Archaea

| Phylum Thermoproteota

| | Class Nitrososphaeria

| | | Order Nitrososphaerales

| | | | Family Nitrosopumilaceae

| | | | | Genus TA-20

Next, we observe the distribution of detected GH gene families across all 817 MAGs:

bw_gh_counts <- sapply(bw_gh, function(column) sum(column > 0)) %>% as.data.frame()
bw_gh_counts <- bw_gh_counts[-1, , drop = FALSE]
colnames(bw_gh_counts)[1] <- "Number of MAGs"
bw_gh_counts <- bw_gh_counts %>% rownames_to_column(var = "GH family")

# graph frequency of GH genes
ggplot(bw_gh_counts, aes(x = reorder(`GH family`, `Number of MAGs`), 
                         y = `Number of MAGs`,
                         fill = `Number of MAGs`)) +
  geom_col() +
  # geom_text(aes(label = `Number of MAGs`), vjust = 0.5, hjust = 0, colour = "black",
            # size = 2) +
  labs(title = 'GH gene frequency across MAGs', x = 'GH family') +
  theme(legend.position = 'none', 
        axis.text.x = element_text(angle = 90, vjust = 0.5))

# ggsave("images/gh_distribution.png", plot=last_plot())
row_sums <- rowSums(bw_gh[,-1])
mag_by_gh <- data.frame(id = bw_gh$id, GHs = row_sums)
colnames(mag_by_gh)[1] <- "Bin ID"
colnames(mag_by_gh)[2] <- "Number of GHs"

# graph number of GHs per MAG
ggplot(mag_by_gh, aes(x = reorder(`Bin ID`, `Number of GHs`), 
                         y = `Number of GHs`,
                         fill = `Number of GHs`)) +
  geom_col() +
  # geom_text(aes(label = `Number of MAGs`), vjust = 0.5, hjust = 0, colour = "black",
            # size = 2) +
  labs(title = 'GH gene families per MAG', x = 'MAG') +
  theme(legend.position = 'none', 
        axis.text.x = element_text(angle = 90, vjust = 0.5))

From this we can infer that certain GH genes are relatively ubiquitous across genomes. In microTrait’s raw data files (found in microTrait’s GitHub repository), GH “rules” are mapped to substrates, effectively assigning each gene’s association(s) with certain complex carbohydrates. However, the level of specificity in these rules is lacking because they do not account for GH subfamilies.