packages <- c("R.utils", "RColorBrewer", "ape", "assertthat", "checkmate",
              "coRdon", "corrplot", "dendextend", "DESeq2", "devtools", "doParallel", 
              "dplyr", "factoextra", "formatR", "futile.logger", "ggplot2", 
              "grid", "gtools", "kmed", "lazyeval", "magrittr", "parallel", 
              "pheatmap", "readr", "stringr", "tibble", "tictoc", "tidyr", "vegan")
for (pkg in packages) {
  library(pkg, character.only=TRUE)
}

# bioconductor packages
library(BiocManager)
library(coRdon)

# everything else
library(gRodon)
library(EcolUtils)
library(ggtree)
library(microtrait)

load("data/traits.RData")

This file goes over the differential abundance analysis that was performed on the MAGs. Thank you to Jared Gracia-David ’23 for providing the DESeq code, from which I was able to generate the figures.

1 Running DESeq

host_counts <- readRDS("data/bin_counts.rds")
# round to int
host_counts <- host_counts %>%
  mutate_if(is.numeric, ~round(., 0))

# organic
host_counts_o <- host_counts %>%
  select(starts_with('bin') | ends_with("O"))

# mineral
host_counts_m <- host_counts %>%
  select(starts_with('bin') | ends_with("M"))

# set row names
hc <- as.matrix(host_counts[,-1])
row.names(hc) <- host_counts$bin
hc_o <- as.matrix(host_counts_o[,-1])
row.names(hc_o) <- host_counts_o$bin
hc_m <- as.matrix(host_counts_m[,-1])
row.names(hc_m) <- host_counts_m$bin

#Compile coldata
sample_mapping <- read_csv("data/Sample_mapping.csv")

## Rows: 28 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): file, Sample, Treatment, Layer
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

sample_mapping$Treatment <- as.factor(sample_mapping$Treatment)
sample_mapping$Layer <- as.factor(sample_mapping$Layer)

#Treatment: heated vs control - plus separate analysis by layer
coldata <- sample_mapping %>%
  select("Sample", "Treatment")

coldata_o <- sample_mapping %>%
  filter(Layer == "Organic") %>%
  select("Sample", "Treatment")

coldata_m <- sample_mapping %>%
  filter(Layer == "Mineral") %>%
  select("Sample", "Treatment")

#Run DESeq
dds <- DESeqDataSetFromMatrix(countData = hc,
                              colData = coldata,
                              design = ~ Treatment)

## converting counts to integer mode

dds$Treatment = relevel(dds$Treatment, ref="Control")
dds <- DESeq(dds)

## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 61 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

res <- results(dds, tidy=TRUE)

# Add a column to the data frame to specify if they are UP- or DOWN- regulated 
#(log2fc respectively positive or negative)
# from https://biostatsquid.com/volcano-plots-r-tutorial/
res$diffexpressed <- 'NO'
res$diffexpressed[res$log2FoldChange > 0.6 & res$padj < 0.05] <- 'UP'
res$diffexpressed[res$log2FoldChange < -0.6 & res$padj < 0.05] <- 'DOWN'

res.sig <- res %>%
  filter(padj<0.05)
bw_class <- bw_gtdb_tax %>% select(id, Class)
colnames(bw_class)[1] <- 'row'
res.sig <- merge(res.sig, bw_class)

2 Table of significant results

library(DT)
datatable(res.sig %>%
            select(row, baseMean, log2FoldChange, padj, Class) %>%
            mutate_if(is.numeric, round, 5)
)

3 Volcano plot - All MAGs

ggplot(data = res, aes(x = log2FoldChange, y = -log10(padj), color = diffexpressed)) +
  geom_vline(xintercept = c(-0.6, 0.6), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.05), col = "gray", linetype = 'dashed') +
  ggtitle("Differential abundances of all MAGs based on treatment (all layers)") +
  xlab(expression("log"[2]*"FC")) +
  ylab(expression("-log"[10]*"adjusted p-value")) +
  scale_color_manual(name = "Change in abundance",
                     values = c("#f8766dff", "grey", "#00bfc4ff"),
                     labels = c("Decreased", "Not significant", "Increased")) +
  theme(legend.position = "bottom") +
  geom_point()

## Warning: Removed 433 rows containing missing values (`geom_point()`).

As shown, 433 MAGs were missing adjusted p-values (due to low base mean counts), so they were excluded.

4 Volcano plot - Decomposer MAGs

4.1 Divide results into clusters and add cluster labels

# select decomposer mags
res.decomp <- res[res$row %in% bw_gh_transport$id, ]
res.decomp.sig <- res.decomp %>%
  filter(padj<0.05)

# version without taxa for now
bw_gh_transport_cl_2 <- bw_gh_transport[, 1:37]
row.names(bw_gh_transport_cl_2) <- bw_gh_transport$id
bw_gh_transport_cl_2 <- bw_gh_transport_cl_2[, !(names(bw_gh_transport_cl_2) %in% c('id'))]
dist.1_2 <- vegdist(bw_gh_transport_cl_2, method = "jaccard", binary = TRUE)
clust.1_2 <- hclust(dist.1_2, method="ward.D2")
row_dend_2 = as.dendrogram(clust.1_2)
# get assignments
cluster_assignments <- dendextend::cutree(row_dend_2, k = 20) %>% as.data.frame()
colnames(cluster_assignments)[1] <- 'cluster'
cluster_assignments <- rownames_to_column(cluster_assignments, var = "row")
# match by id
res_clustered <- merge(cluster_assignments, res.decomp)

# label only significant for figure
res_clustered$cluster <- ifelse(res_clustered$padj < 0.05, res_clustered$cluster, NA)

4.2 Plot results

ggplot(data = res_clustered, aes(x = log2FoldChange, y = -log10(padj))) +
  geom_vline(xintercept = c(-0.6, 0.6), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.05), col = "gray", linetype = 'dashed') +
  ggtitle("Differential abundances based on treatment, by cluster (all layers)") +
  xlab(expression("log"[2]*"FC")) +
  ylab(expression("-log"[10]*"adjusted p-value")) +
  scale_color_discrete(name = "Cluster") +
  theme(legend.position = "bottom", legend.direction = "horizontal") +
  geom_point(aes(color = factor(cluster))) +
  geom_text(aes(label = cluster), nudge_y = 0.1, size = 2)

## Warning: Removed 413 rows containing missing values (`geom_point()`).

## Warning: Removed 768 rows containing missing values (`geom_text()`).

Here, 413 rows were missing adjusted p-values, so they were removed.

4.3 Organic vs. mineral plots

The following plots were not included in the main text but remain here for access. To generate these plots, the hc and coldata arguments in the DESeqDataSetFromMatrix() method are replaced with their organic (_o) and mineral (_m) variants.

Differential abundances of all MAGs, organic layer

Differential abundances of all MAGs, mineral layer

Differential abundances of ‘decomposer’ MAGs, organic layer

Differential abundances of ‘decomposer’ MAGs, mineral layer

Differential abundance analysis

Seojin Jung

2024-04-16