library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

This file covers the generation of various graphs and other figures to analyze the taxonomic diversity of the MAGs from Barre Woods.

1 Extract data from TSV

BW_coassembly_GTDB_all <- read_tsv("data/metaG_coassembly_all_metabat.tsv")
## Rows: 817 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (11): Bin ID, Bin Quality, Bin Lineage, Domain, Phylum, Class, Order, F...
## dbl   (9): Completeness, Contamination, Total Number of Bases, 5s rRNA, 16s ...
## lgl   (1): ...1
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# HQ only
BW_coassembly_90_5 <- BW_coassembly_GTDB_all %>%
  filter(Completeness > 90) %>% 
  filter(Contamination < 5) 
  write_tsv(BW_coassembly_90_5, "data/BW_coassembly_90_5.tsv")
  
# MQ only
BW_coassembly_50_10 <- BW_coassembly_GTDB_all %>%
  filter(`Bin Quality` == 'MQ')

# for some reason the Bin Quality columns don't match actual completion/contamination values,
# so i'm just going to update them for this
BW_coassembly_GTDB_all <- BW_coassembly_GTDB_all %>% 
  mutate(
    `Bin Quality` = case_when(
      Completeness > 90 & Contamination < 5 ~ 'HQ',
      TRUE ~ `Bin Quality`
    )
  )

2 Tables

2.1 Table of Phylum Counts

datatable(BW_coassembly_GTDB_all %>% 
  group_by(Phylum) %>% 
  summarise(n = n()) %>%
  mutate(freq = 100 * n / sum(n)) %>% 
  # mutate(freq = n / sum(n)) %>% 
  mutate_if(is.numeric, round, 1)  
)

2.2 Table of Class counts

datatable(BW_coassembly_GTDB_all %>% 
  group_by(Class) %>% 
  summarise(n = n()) %>%
  mutate(freq = 100 * n / sum(n)) %>% 
  mutate_if(is.numeric, round, 1)  
)

2.3 Table of High Quality MAGs

datatable(BW_coassembly_90_5 %>% 
  group_by(Phylum) %>% 
  summarise(n = n()) %>%
  mutate(freq = n / sum(n)) %>% 
  arrange(-n)
)

3 Counts

3.1 Number of Archaea vs. Bacteria

3.2 Phyla MAG count bar chart

3.3 Class MAG count bar chart

3.4 Novel genus count per phylum bar chart

3.5 Novel Families

3.6 Novel Orders

3.7 Novel Classes

4 Genome sizes

4.1 Histogram of genome sizes

BW_coassembly_GTDB_all %>% 
ggplot(aes(x = `Total Number of Bases`)) + 
  geom_histogram(colour = "black", fill = "maroon", binwidth=500000) +
  ggtitle("Genome size of MAGs") +
  ylab("Genome size") + 
  theme(text = element_text(size = 20, color="black")) +
  theme(plot.margin = margin(1, 1, 1, 1, "cm"))

#  theme(axis.text.x = element_text(angle = 90)) 

# ggsave("images/mags_size.png", plot=last_plot())

4.2 Total number of bases vs. gene count for each bin

# get r^2
gen_size <- lm(BW_coassembly_GTDB_all$`Total Number of Bases` ~ BW_coassembly_GTDB_all$`Gene Count`)
rsq <- round(summary(gen_size)$r.squared, 4)

BW_coassembly_GTDB_all %>% 
ggplot(aes(x = `Total Number of Bases`, y = `Gene Count`)) + 
  geom_point(color = 'springgreen3', alpha = 0.30) + 
  ggtitle("Genome size vs. gene count per MAG") +
  xlab("MAG size (bp)") +
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 0.2) +
  annotate("text", x = max(BW_coassembly_GTDB_all$`Total Number of Bases`), 
           y = max(BW_coassembly_GTDB_all$`Gene Count`), 
           label = bquote(R^2 == .(rsq)), 
           hjust = 1.1, vjust = 1.1) +
  theme(text = element_text(size = 15, color="black")) +
  theme(plot.margin = margin(1, 2, 1, 1, "cm"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning in is.na(x): is.na() applied to non-(list or vector) of type 'language'

# ggsave("images/base_vs_gene_count.png", plot=last_plot())

4.3 Genome size by phylum

BW_coassembly_GTDB_all %>% 
  ggplot(aes(x = reorder(Phylum, `Total Number of Bases`), y = `Total Number of Bases`, color = Domain)) + 
  geom_point() +
  ggtitle("Genome size of MAGs for each Phylum") +
  ylab("MAG size (bp)") + 
  xlab("Phylum") +
  coord_flip()

# get min/max vals
BW_coassembly_GTDB_all %>%
  group_by(Phylum) %>%
  summarize(
    min = min(`Total Number of Bases`),
    max = max(`Total Number of Bases`)
  )
## # A tibble: 21 × 3
##    Phylum                 min     max
##    <chr>                <dbl>   <dbl>
##  1 Acidobacteriota    1365391 8278539
##  2 Actinobacteriota   1173425 8244737
##  3 Bacteroidota       1806214 6162674
##  4 Chloroflexota      1543268 4968490
##  5 Dependentiae        704041 1240586
##  6 Desulfobacterota_B 2364161 5018362
##  7 Dormibacterota     2339793 7165257
##  8 Eremiobacterota    1418693 3018426
##  9 FCPU426            1830931 2732067
## 10 Gemmatimonadota    1845447 5049196
## # ℹ 11 more rows
# ggsave("images/genome_size_by_phylum.png", plot=last_plot())

4.4 Genome size by class

BW_coassembly_GTDB_all %>% 
  filter(!is.na(Class)) %>%
  ggplot(aes(x = reorder(Class, `Total Number of Bases`), y = `Total Number of Bases`, color = Domain)) + 
  geom_point() +
  ggtitle("Genome size of MAGs for each Class") +
  ylab("MAG size (bp)") + 
  xlab("Class") +
  coord_flip()

# ggsave("images/genome_size_by_class.png", plot=last_plot())

4.5 Range of genome sizes per phylum

genome_range_by_phylum <- BW_coassembly_GTDB_all %>% group_by(Phylum) %>%
  summarise(min = min(`Total Number of Bases`),
            max = max(`Total Number of Bases`),
            avg = mean(`Total Number of Bases`))

BW_coassembly_GTDB_all %>% 
  ggplot(aes(x = `Total Number of Bases`, y = Phylum)) + 
  ggtitle("Genome sizes per phylum") + 
  geom_boxplot(fill = "springgreen3") + 
  theme(plot.margin = margin(1, 1, 1, 1, "cm"))

# ggsave("images/genome_size_range.png", plot=last_plot())

5 Completeness

5.1 Completeness by bin

BW_coassembly_GTDB_all %>% 
ggplot(aes(x = `Completeness`, fill = `Bin Quality`)) + 
  geom_histogram(colour = "black", binwidth=5) +
  ggtitle("Completeness of MAGs") +
  ylab("Number of MAGs") + 
  xlab("Completeness (%)") +
  theme(text = element_text(size = 20, color="black")) 

# vals
BW_coassembly_GTDB_all %>%
  summarize(
    min = min(Completeness),
    max = max(Completeness),
    avg = mean(Completeness)
  )
## # A tibble: 1 × 3
##     min   max   avg
##   <dbl> <dbl> <dbl>
## 1    50   100  73.8
BW_coassembly_90_5 %>%
    summarize(
    min = min(Completeness),
    max = max(Completeness),
    avg = mean(Completeness)
  )
## # A tibble: 1 × 3
##     min   max   avg
##   <dbl> <dbl> <dbl>
## 1  90.0   100  94.6
BW_coassembly_50_10 %>%
  summarize(
    min = min(Completeness),
    max = max(Completeness),
    avg = mean(Completeness)
  )
## # A tibble: 1 × 3
##     min   max   avg
##   <dbl> <dbl> <dbl>
## 1    50   100  72.7
# ggsave("images/bin_completeness.png", plot=last_plot())