R code
# Load libraries
library(tidyverse)
library(DT)This report is an analysis of the Barre Woods filter metagenomes from the combined assembly Assembly, binning, quality control and classification was done on KBase (https://narrative.kbase.us/narrative/145971. The checkv and virsorter files were downloaded to this project folder.
# Load libraries
library(tidyverse)
library(DT)virsorter2 identifies dsDNAphages, NCLDV, ssDNA, virophages and some rna viruses in contigs. An 0.8 cutoff score is commonly used to remove potential false positives. checkv assesses the quality of contigs.
virsorter <- read_tsv("data_combined_assembly/virsorter-final-viral-score.tsv") |>
filter(max_score_group == "dsDNAphage") |>
select(-c(dsDNAphage, ssDNA, NCLDV, lavidaviridae)) |>
rename("contig_id" = "seqname")
checkv <- read_tsv("data_combined_assembly/coa-checkv_quality_summary.tsv") virsorter_checkv <- left_join(virsorter, checkv, by = "contig_id")virsorter_checkv_HQ <- virsorter_checkv |>
filter(checkv_quality == "Complete" | checkv_quality == "High-quality")virsorter_checkv_jumbo <- virsorter_checkv |>
filter(checkv_quality == "Complete" | checkv_quality == "High-quality") |>
filter(length >= 200000)# write summary file
write_tsv(virsorter_checkv, "data_combined_assembly/coa-virsorter_checkv.tsv")
write_tsv(virsorter_checkv_HQ, "data_combined_assembly/coa-virsorter_checkv_HQ.tsv")
write_tsv(virsorter_checkv_jumbo, "data_combined_assembly/coa-virsorter_checkv_jumbo.tsv") virsorter_checkv |>
ggplot(aes(x = checkv_quality)) +
geom_bar() virsorter_checkv |>
mutate(across(checkv_quality, as_factor)) |>
group_by(checkv_quality) |>
summarise(n = n()) |>
mutate(freq = n / sum(n))# A tibble: 5 × 3
checkv_quality n freq
<fct> <int> <dbl>
1 High-quality 372 0.0166
2 Complete 16 0.000714
3 Medium-quality 450 0.0201
4 Not-determined 7551 0.337
5 Low-quality 14012 0.626
Most low quality contigs are less than 20kb
virsorter_checkv |>
ggplot(aes(x = length, fill = checkv_quality)) +
geom_histogram(colour = "black", binwidth=10000) +
ggtitle("Genome size of Phage") +
xlab("Genome size") +
theme(text = element_text(size = 20, color="black")) theme(axis.text.x = element_text(angle = 90)) List of 1
$ axis.text.x:List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : NULL
..$ angle : num 90
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi FALSE
..- attr(*, "class")= chr [1:2] "element_text" "element"
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi FALSE
- attr(*, "validate")= logi TRUE
virsorter_checkv_HQ |>
ggplot(aes(x = length, fill = checkv_quality)) +
geom_histogram(colour = "black", binwidth=10000) +
ggtitle("Genome size of Phage") +
xlab("Genome size") +
#theme(text = element_text(size = 20, color="black"))
theme(axis.text.x = element_text(angle = 45)) # read metabat files to get each contig and gtdb file
metabat <- read_tsv("data_combined_assembly/metabat_depth_contigs.tsv")
checkm_gtdb <- read_tsv("data_combined_assembly/checkm_gtdb_NEB_coassembly_2.5k.tsv") |>
select(-c(`Marker Lineage`, `# Genomes`, `# Markers`, `# Marker Sets`, `0`, `1`, `2`, `3`, `4`, `5`, `user_genome`))# split contig name based on ||
virsorter_checkv_contig <- virsorter_checkv |>
separate(`contig_id`, c("contig_id", "full-partial"), "\\|\\|")
# join with metabat
virsorter_checkv_contig_metabat <- left_join(virsorter_checkv_contig, metabat, by = c("contig_id" = "seqname"))
# join with checkm_gtdb
virsorter_checkv_contig_metabat_checkm_gtdb <- left_join(virsorter_checkv_contig_metabat, checkm_gtdb, by = c("bin"))virsorter_checkv_contig_metabat_checkm_gtdb_MQ <- virsorter_checkv_contig_metabat_checkm_gtdb |>
filter(Completeness > 50) |>
filter(Contamination < 10)
virsorter_checkv_contig_metabat_checkm_gtdb_noMQ <- virsorter_checkv_contig_metabat_checkm_gtdb |>
filter(Completeness < 50) vContact groups contigs into clusters along with phages in NCBI’s viurs reference database ### data processing
vcontact <- read_csv("data_combined_assembly/vcontact_genome_by_genome_overview.csv")
vcontact_ntw <- read_table("data_combined_assembly/coa-vcontact_c1.ntw", col_names = FALSE) virsorter_checkv_HQ_vcontact <- left_join(virsorter_checkv_HQ, vcontact, by = c("contig_id" = "Genome"))
virsorter_checkv_jumbo_vcontact <- left_join(virsorter_checkv_jumbo, vcontact, by = c("contig_id" = "Genome"))# make a vector of HQ and jumbo contigs
HQ_contigs <- pull(virsorter_checkv_HQ, contig_id)
jumbo_contigs <- pull(virsorter_checkv_jumbo_vcontact, contig_id)
# create list of low quality genomes
# There are genes in the vcontact network that are not in the merged virsorter/vcheck file
# So first make a list of vcontact genomes with contig in name then filter HQ
vcontact_contig_LQ <- vcontact |>
filter(grepl('Contig', Genome)) |>
filter(!Genome %in% HQ_contigs)
# make a vector of low quality contigs
LQ_contigs <- pull(vcontact_contig_LQ, Genome)
# filter vcontact network based on HQ genomes}
vcontact_ntw_HQ <- vcontact_ntw |>
filter(X1 %in% HQ_contigs) |>
filter(!X1 %in% LQ_contigs & !X2 %in% LQ_contigs)
# filter vcontact network based on jumbo genomes
vcontact_ntw_jumbo <- vcontact_ntw |>
filter(X1 %in% jumbo_contigs) |>
filter(!X1 %in% LQ_contigs & !X2 %in% LQ_contigs)
# filter vcontact network based on jumbo genomes and interaction score <30
vcontact_ntw_jumbo30 <- vcontact_ntw_jumbo |>
filter(X3 > 30)write_tsv(virsorter_checkv_HQ_vcontact, "data_combined_assembly/coa-virsorter_checkv_HQ_vcontact.tsv")
write_tsv(virsorter_checkv_jumbo_vcontact,"data_combined_assembly/coa-virsorter_checkv_jumbo_vcontact.tsv")
write.table(vcontact_ntw_HQ, "data_combined_assembly/vcontact_ntw_HQ.ntw", sep = " ", col.names=F, row.names=F, quote=F)
write.table(vcontact_ntw_jumbo, "data_combined_assembly/vcontact_ntw_jumbo.ntw", sep = " ", col.names=F, row.names=F, quote=F)
write.table(vcontact_ntw_jumbo30, "data_combined_assembly/vcontact_ntw_jumbo30.ntw", sep = " ", col.names=F, row.names=F, quote=F)vcontact_ntw_jumbo |>
ggplot(aes(x = X3)) +
geom_histogram(colour = "black", binwidth=10) +
xlab("Score") +
#theme(text = element_text(size = 20, color="black"))
theme(axis.text.x = element_text(angle = 45)) File > Import> Network from file Open Advanced options Select delimiter = space deselect use first column as names select col 1 as source select col 2 as target select col 3 as edge
shift then mouse drag
DRAMv_amg <- read_tsv("data_combined_assembly/DRAMv_amg_summary.tsv") |>
rename("contig_id" = "scaffold")
DRAMv_stats <- read_tsv("data_combined_assembly/DRAMv_vMAG_stats.tsv") |>
rename("contig_id" = "...1")DRAMv_stats_virsorter_checkv_HQ <- left_join(virsorter_checkv_HQ, DRAMv_stats, by = "contig_id")
DRAMv_stats_virsorter_checkv_jumbo <- left_join(virsorter_checkv_jumbo, DRAMv_stats, by = "contig_id")DRAMv_amg_virsorter_checkv <- left_join(DRAMv_amg, virsorter_checkv, by = "contig_id")DRAMv_amg_virsorter_checkv_HQ <- DRAMv_amg_virsorter_checkv |>
filter(contig_id %in% HQ_contigs)
DRAMv_amg_virsorter_checkv_jumbo <- DRAMv_amg_virsorter_checkv |>
filter(contig_id %in% jumbo_contigs)write_tsv(DRAMv_stats_virsorter_checkv_HQ, "data_combined_assembly/coa-DRAMv_stats_virsorter_checkv_HQ .tsv")
write_tsv(DRAMv_stats_virsorter_checkv_jumbo,"data_combined_assembly/coa-DRAMv_stats_virsorter_checkv_jumbo.tsv")
write_tsv(DRAMv_amg_virsorter_checkv_HQ, "data_combined_assembly/coa-DRAMv_amg_virsorter_checkv_HQ.tsv")
write_tsv(DRAMv_amg_virsorter_checkv_jumbo,"data_combined_assembly/coa-DRAMv_amg_virsorter_checkv_jumbo.tsv") DRAMv_stats_virsorter_checkv_HQ |>
ggplot(aes(x = `Gene count`, y = `potential AMG count`)) +
geom_point() DRAMv_stats_virsorter_checkv_HQ |>
filter(provirus == "No") |>
ggplot(aes(x = `Gene count`, y = `potential AMG count`)) +
geom_point() DRAMv_stats_virsorter_checkv_HQ |>
filter(provirus == "No") |>
ggplot(aes(x = `Gene count`, y = `Viral genes with host benefits`)) +
geom_point() DRAMv_stats_virsorter_checkv_HQ |>
filter(provirus == "No") |>
ggplot(aes(x = `length`, y = `Gene count`)) +
geom_point()DRAMv_stats_virsorter_checkv_HQ |>
filter(provirus == "No") |>
ggplot(aes(x = `length`, y = `Gene count`)) +
geom_point(aes(colour = `Transposase present`))virmatch <- read_csv("data_combined_assembly/coa-virmatcher.csv") |>
rename("contig_id" = "Original Viral population") |>
rename("bin" = "Original Host")
checkm_gtdb <- read_tsv("data_combined_assembly/checkm_gtdb_NEB_coassembly_2.5k.tsv")
detect_viral_bins_final_calls <- read_tsv("data_combined_assembly/detect_viral_bins_final_calls.tsv") virsorter_checkv_HQ_virmatch <- left_join(virsorter_checkv_HQ, virmatch, by = "contig_id")
virsorter_checkv_HQ_virmatch_checkm_gtdb <- left_join(virsorter_checkv_HQ_virmatch, checkm_gtdb, by = "bin") |>
filter(Completeness > 50) |>
filter(Contamination < 10)Should check to see if any HQ phage are in bins which would complicate results. Find bin associated with contig
### to map contigs to bins
metabat_bins_contigs <- read_tsv("data_combined_assembly/metabat_depth_contigs.tsv") |>
rename("contig_id" = "seqname") |>
select(-c(contigLen, totalAvgDepth, `contig_%GC`))
virsorter_checkv_HQ_virmatch_checkm_gtdb$contig_id <- str_replace_all(virsorter_checkv_HQ_virmatch_checkm_gtdb$contig_id, "full", "")
virsorter_checkv_HQ_virmatch_checkm_gtdb$contig_id <- str_replace_all(virsorter_checkv_HQ_virmatch_checkm_gtdb$contig_id, "[[|]]", "")
virsorter_checkv_HQ_virmatch_checkm_gtdb_bin <- left_join(virsorter_checkv_HQ_virmatch_checkm_gtdb, metabat_bins_contigs, by = "contig_id")
write_tsv(virsorter_checkv_HQ_virmatch_checkm_gtdb_bin, "data_combined_assembly/virsorter_checkv_HQ_virmatch_checkm_gtdb_bin.tsv") Genetic exchange shapes ultra-small Patescibacteria metabolic capacities in the terrestrial subsurface https://www.biorxiv.org/content/10.1101/2022.10.05.510940v2.full
From MAGs to riches: exploring the microbial community of activated sludge using high-quality MAGS https://www.youtube.com/watch?v=EzIN7ycK6-E They also identified 38 reduced-genome bacteria and archaea, including one Paceibacteria genome: an exciting find, as organisms from this group had not previously been characterised in situ. Using the 16S sequences, they were able to design FISH probes and visualise the bacterium; assembly of the genome also identified versions with and without an additional fragment which was revealed to represent a pha
https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.3002083