Targeted enrichment and discovery of jumbo phages

Author

Jeffrey Blanchard

Published

May 4, 2023

Overview

This report is an analysis of the Barre Woods filter metagenomes from the individual assemblies. Assembly, binning, quality control, and classification were done on KBase (https://narrative.kbase.us/narrative/145971). The checkv and virsorter output files were downloaded to this project folder.

Load Libraries

R code
# Load libraries
library(tidyverse)
library(DT)

Import Data

Import the virsorter files. Filter to include just the dsDNA phages.

R code
# Read the virsorter final viral score table, keep only rows whose
# max_score_group is "dsDNAphage", drop the dsDNAphage, ssDNA, NCLDV and
# lavidaviridae columns, and rename `seqname` to `contig_id`.
virsorter <- "data_individual_assembly/all-ind-virsorter-final-viral-score.tsv" |>
  read_tsv() |>
  filter(max_score_group == "dsDNAphage") |>
  select(!c(dsDNAphage, ssDNA, NCLDV, lavidaviridae)) |>
  rename(contig_id = seqname)
R code
# Read the checkv quality summary table.
checkv <- "data_individual_assembly/all-ind-checkv_quality_summary.tsv" |>
  read_tsv()

Join checkv and virsorter dataframes

R code
# Keep every virsorter row and attach the checkv columns for the same
# contig_id; rows without a checkv match get NA in the checkv columns.
virsorter_checkv <- virsorter |>
  left_join(checkv, by = "contig_id")

Filter to high-quality and complete genomes

R code
# Subset to rows whose checkv_quality is "Complete" or "High-quality".
# Rows with a missing checkv_quality are excluded.
virsorter_checkv_HQ <- virsorter_checkv |>
  filter(checkv_quality %in% c("Complete", "High-quality"))
R code
# Save the joined table and its Complete/High-quality subset as TSV files.
virsorter_checkv |>
  write_tsv("data_individual_assembly/all-ind-virsorter_checkv.tsv")
virsorter_checkv_HQ |>
  write_tsv("data_individual_assembly/all-ind-virsorter_checkv_HQ.tsv")

Data analysis

Barchart of checkv_quality

R code
# Bar chart: number of rows in each checkv_quality category.
ggplot(virsorter_checkv, aes(x = checkv_quality)) +
  geom_bar()

Table with the count and proportion of contigs in each checkv_quality category

R code
# Tabulate checkv_quality: `n` is the row count per category and `freq` is
# that count divided by the total number of rows. Converting to a factor
# with as_factor() keeps the categories in order of first appearance.
virsorter_checkv |>
  mutate(checkv_quality = as_factor(checkv_quality)) |>
  count(checkv_quality) |>
  mutate(freq = n / sum(n))
# A tibble: 6 × 3
  checkv_quality     n     freq
  <fct>          <int>    <dbl>
1 High-quality     418 0.0209  
2 Complete          13 0.000650
3 Medium-quality   506 0.0253  
4 Low-quality    12703 0.635   
5 Not-determined  6238 0.312   
6 <NA>             127 0.00635 

Histogram

R code
# Histogram of `length` for all joined contigs, stacked by checkv_quality,
# using bins of width 10000 (in the units of `length`) and enlarged text.
ggplot(virsorter_checkv, aes(x = length, fill = checkv_quality)) +
  geom_histogram(binwidth = 10000, colour = "black") +
  labs(title = "Genome size of Phage", x = "Genome size") +
  theme(text = element_text(colour = "black", size = 20))

R code
  # NOTE(review): this theme() call is evaluated on its own; it is not added
  # to any plot with `+`, so it only prints the theme object and rotates no
  # axis labels. Presumably it was meant to be appended to a histogram chain
  # -- confirm which plot, or remove it.
  theme(axis.text.x = element_text(angle = 90)) 
List of 1
 $ axis.text.x:List of 11
  ..$ family       : NULL
  ..$ face         : NULL
  ..$ colour       : NULL
  ..$ size         : NULL
  ..$ hjust        : NULL
  ..$ vjust        : NULL
  ..$ angle        : num 90
  ..$ lineheight   : NULL
  ..$ margin       : NULL
  ..$ debug        : NULL
  ..$ inherit.blank: logi FALSE
  ..- attr(*, "class")= chr [1:2] "element_text" "element"
 - attr(*, "class")= chr [1:2] "theme" "gg"
 - attr(*, "complete")= logi FALSE
 - attr(*, "validate")= logi TRUE
R code
# Histogram of `length` for the Complete/High-quality subset, stacked by
# checkv_quality, using bins of width 10000 (in the units of `length`).
# The x-axis tick labels are rotated 45 degrees; the larger-text theme used
# for the full data set is intentionally left off here.
ggplot(virsorter_checkv_HQ, aes(x = length, fill = checkv_quality)) +
  geom_histogram(binwidth = 10000, colour = "black") +
  labs(title = "Genome size of Phage", x = "Genome size") +
  theme(axis.text.x = element_text(angle = 45))