library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
This file covers the generation of various graphs and other figures to analyze the taxonomic diversity of the MAGs from Barre Woods.
BW_coassembly_GTDB_all <- read_tsv("data/metaG_coassembly_all_metabat.tsv")
## Rows: 817 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): Bin ID, Bin Quality, Bin Lineage, Domain, Phylum, Class, Order, F...
## dbl (9): Completeness, Contamination, Total Number of Bases, 5s rRNA, 16s ...
## lgl (1): ...1
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# HQ only
BW_coassembly_90_5 <- BW_coassembly_GTDB_all %>%
filter(Completeness > 90) %>%
filter(Contamination < 5)
write_tsv(BW_coassembly_90_5, "data/BW_coassembly_90_5.tsv")
# MQ only
BW_coassembly_50_10 <- BW_coassembly_GTDB_all %>%
filter(`Bin Quality` == 'MQ')
# for some reason the Bin Quality columns don't match actual completion/contamination values,
# so i'm just going to update them for this
BW_coassembly_GTDB_all <- BW_coassembly_GTDB_all %>%
mutate(
`Bin Quality` = case_when(
Completeness > 90 & Contamination < 5 ~ 'HQ',
TRUE ~ `Bin Quality`
)
)
datatable(BW_coassembly_GTDB_all %>%
group_by(Phylum) %>%
summarise(n = n()) %>%
mutate(freq = 100 * n / sum(n)) %>%
# mutate(freq = n / sum(n)) %>%
mutate_if(is.numeric, round, 1)
)
datatable(BW_coassembly_GTDB_all %>%
group_by(Class) %>%
summarise(n = n()) %>%
mutate(freq = 100 * n / sum(n)) %>%
mutate_if(is.numeric, round, 1)
)
datatable(BW_coassembly_90_5 %>%
group_by(Phylum) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n)) %>%
arrange(-n)
)
BW_coassembly_GTDB_all %>%
ggplot(aes(x = `Total Number of Bases`)) +
geom_histogram(colour = "black", fill = "maroon", binwidth=500000) +
ggtitle("Genome size of MAGs") +
ylab("Genome size") +
theme(text = element_text(size = 20, color="black")) +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
# theme(axis.text.x = element_text(angle = 90))
# ggsave("images/mags_size.png", plot=last_plot())
# get r^2
gen_size <- lm(BW_coassembly_GTDB_all$`Total Number of Bases` ~ BW_coassembly_GTDB_all$`Gene Count`)
rsq <- round(summary(gen_size)$r.squared, 4)
BW_coassembly_GTDB_all %>%
ggplot(aes(x = `Total Number of Bases`, y = `Gene Count`)) +
geom_point(color = 'springgreen3', alpha = 0.30) +
ggtitle("Genome size vs. gene count per MAG") +
xlab("MAG size (bp)") +
geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 0.2) +
annotate("text", x = max(BW_coassembly_GTDB_all$`Total Number of Bases`),
y = max(BW_coassembly_GTDB_all$`Gene Count`),
label = bquote(R^2 == .(rsq)),
hjust = 1.1, vjust = 1.1) +
theme(text = element_text(size = 15, color="black")) +
theme(plot.margin = margin(1, 2, 1, 1, "cm"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning in is.na(x): is.na() applied to non-(list or vector) of type 'language'
# ggsave("images/base_vs_gene_count.png", plot=last_plot())
BW_coassembly_GTDB_all %>%
ggplot(aes(x = reorder(Phylum, `Total Number of Bases`), y = `Total Number of Bases`, color = Domain)) +
geom_point() +
ggtitle("Genome size of MAGs for each Phylum") +
ylab("MAG size (bp)") +
xlab("Phylum") +
coord_flip()
# get min/max vals
BW_coassembly_GTDB_all %>%
group_by(Phylum) %>%
summarize(
min = min(`Total Number of Bases`),
max = max(`Total Number of Bases`)
)
## # A tibble: 21 × 3
## Phylum min max
## <chr> <dbl> <dbl>
## 1 Acidobacteriota 1365391 8278539
## 2 Actinobacteriota 1173425 8244737
## 3 Bacteroidota 1806214 6162674
## 4 Chloroflexota 1543268 4968490
## 5 Dependentiae 704041 1240586
## 6 Desulfobacterota_B 2364161 5018362
## 7 Dormibacterota 2339793 7165257
## 8 Eremiobacterota 1418693 3018426
## 9 FCPU426 1830931 2732067
## 10 Gemmatimonadota 1845447 5049196
## # ℹ 11 more rows
# ggsave("images/genome_size_by_phylum.png", plot=last_plot())
BW_coassembly_GTDB_all %>%
filter(!is.na(Class)) %>%
ggplot(aes(x = reorder(Class, `Total Number of Bases`), y = `Total Number of Bases`, color = Domain)) +
geom_point() +
ggtitle("Genome size of MAGs for each Class") +
ylab("MAG size (bp)") +
xlab("Class") +
coord_flip()
# ggsave("images/genome_size_by_class.png", plot=last_plot())
genome_range_by_phylum <- BW_coassembly_GTDB_all %>% group_by(Phylum) %>%
summarise(min = min(`Total Number of Bases`),
max = max(`Total Number of Bases`),
avg = mean(`Total Number of Bases`))
BW_coassembly_GTDB_all %>%
ggplot(aes(x = `Total Number of Bases`, y = Phylum)) +
ggtitle("Genome sizes per phylum") +
geom_boxplot(fill = "springgreen3") +
theme(plot.margin = margin(1, 1, 1, 1, "cm"))
# ggsave("images/genome_size_range.png", plot=last_plot())
BW_coassembly_GTDB_all %>%
ggplot(aes(x = `Completeness`, fill = `Bin Quality`)) +
geom_histogram(colour = "black", binwidth=5) +
ggtitle("Completeness of MAGs") +
ylab("Number of MAGs") +
xlab("Completeness (%)") +
theme(text = element_text(size = 20, color="black"))
# vals
BW_coassembly_GTDB_all %>%
summarize(
min = min(Completeness),
max = max(Completeness),
avg = mean(Completeness)
)
## # A tibble: 1 × 3
## min max avg
## <dbl> <dbl> <dbl>
## 1 50 100 73.8
BW_coassembly_90_5 %>%
summarize(
min = min(Completeness),
max = max(Completeness),
avg = mean(Completeness)
)
## # A tibble: 1 × 3
## min max avg
## <dbl> <dbl> <dbl>
## 1 90.0 100 94.6
BW_coassembly_50_10 %>%
summarize(
min = min(Completeness),
max = max(Completeness),
avg = mean(Completeness)
)
## # A tibble: 1 × 3
## min max avg
## <dbl> <dbl> <dbl>
## 1 50 100 72.7
# ggsave("images/bin_completeness.png", plot=last_plot())