Skip to content

Commit

Permalink
Created a script to adadpt new mgs-workflow code to create newest ver…
Browse files Browse the repository at this point in the history
…sion of hv-clade-counts.tsv, which creates correct (and slightly lower) n_reads_clade counts/
  • Loading branch information
simonleandergrimm committed Jul 9, 2024
1 parent 9e69245 commit f7aaddf
Show file tree
Hide file tree
Showing 3 changed files with 265 additions and 0 deletions.
38 changes: 38 additions & 0 deletions scripts/collapseHV.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env Rscript

library(tidyverse)

args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 2) {
stop("Usage: Rscript collapseHV-main.R <hv_hits_filtered> <hv_hits_collapsed>")
}

hv_hits_filtered <- args[1]
hv_hits_collapsed <- args[2]

rmax <- function(x){
if (all(is.na(x))) return(NA)
return(max(x, na.rm = TRUE))}

collapse <- function(x) ifelse(all(x == x[1]), x[1], paste(x, collapse="/"))

hits_filtered <- read_tsv(hv_hits_filtered, col_names = TRUE, show_col_types = FALSE)
print(dim(hits_filtered))

reads_collapsed <- hits_filtered %>% group_by(seq_id) %>% summarize(
sample = collapse(sample), genome_id = collapse(genome_id),
taxid_best = taxid[1], taxid = collapse(as.character(taxid)),
best_alignment_score_fwd = rmax(best_alignment_score_fwd),
best_alignment_score_rev = rmax(best_alignment_score_rev),
query_len_fwd = rmax(query_len_fwd), query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],
query_len_rev = rmax(query_len_rev), query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],
classified = rmax(classified), assigned_name = collapse(assigned_name),
assigned_taxid_best = assigned_taxid[1], assigned_taxid = collapse(as.character(assigned_taxid)),
assigned_hv = rmax(assigned_hv), hit_hv = rmax(hit_hv), encoded_hits = collapse(encoded_hits),
adj_score_fwd = rmax(adj_score_fwd), adj_score_rev = rmax(adj_score_rev)
) %>% mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev))

print(dim(reads_collapsed))

# Write output
write_tsv(reads_collapsed, hv_hits_collapsed)
121 changes: 121 additions & 0 deletions scripts/count-viral-taxa.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env Rscript

library(optparse)
library(tidyverse)

# Set arguments
option_list = list(
make_option(c("--reads"), type="character", default=NULL,
help="Path to TSV file of human-infecting virus read data."),
make_option(c("--taxa"), type="character", default=NULL,
help="Path to database of viral taxonomic relationships"),
make_option(c("--output"), type="character", default=NULL,
help="Path to output file.")
)
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

# Set input and output paths
read_db_path <- opt$reads
taxa_db_path <- opt$taxa
out_path <- opt$output

#=====================#
# AUXILIARY FUNCTIONS #
#=====================#

count_children <- function(taxid_tree){
# Count number of direct children of each node in a taxid tree
n_children <- taxid_tree %>% group_by(parent_taxid) %>%
count(name="n_children") %>% rename(taxid = parent_taxid)
db_out <- taxid_tree %>% left_join(n_children, by="taxid") %>%
mutate(n_children = replace_na(n_children, 0))
return(db_out)
}

count_hits_direct <- function(read_db, taxid_tree, group_var){
# Count number of reads directly assigned to each taxid in a tree
taxids <- count_children(taxid_tree)
direct_hits_setup <- read_db %>% group_by(taxid, .data[[group_var]]) %>%
count(name="n_reads_direct", .drop=FALSE) %>%
pivot_wider(names_from=any_of(group_var), values_from=n_reads_direct,
values_fill = 0)
direct_hits_joined <- taxids %>% left_join(direct_hits_setup, by="taxid")
direct_hits <- taxids %>% left_join(direct_hits_setup, by="taxid") %>%
pivot_longer(cols=-(taxid:n_children), names_to=group_var,
values_to="n_reads_direct") %>%
mutate(n_reads_direct = replace_na(n_reads_direct, 0))
# Double pivot ensures all viruses have a count for all samples
return(direct_hits %>% select(-n_children))
}

count_hits_indirect <- function(direct_hits, taxid_tree, group_var){
# Count number of reads assigned to each taxid in a tree or its descendents
iter <- 0
taxids <- count_children(taxid_tree)
n_children <- taxids %>% select(taxid, n_children)
hits_unresolved <- direct_hits %>% left_join(n_children, by="taxid") %>%
mutate(n_reads_clade = n_reads_direct, n_reads_children = 0)
hits_resolved <- tibble()
cat("\tIteration ", iter, ": ", nrow(hits_unresolved), " unresolved, ", nrow(hits_resolved), " resolved.\n", sep="")
while (nrow(hits_unresolved) > 0){
iter <- iter + 1
# Identify leaf nodes
hits_leaf <- hits_unresolved %>% filter(n_children == 0)
# Calculate contributions to parent nodes
hits_children <- hits_leaf %>% group_by(parent_taxid, .data[[group_var]]) %>%
summarize(n_reads_children = sum(n_reads_clade), .groups = "drop") %>%
rename(taxid = parent_taxid)
# Add child reads to direct hits to calculate clade reads
hits_partial <- hits_unresolved %>% filter(n_children > 0) %>%
select(-n_children, -n_reads_children) %>%
left_join(hits_children, by=c("taxid", group_var)) %>%
mutate(n_reads_children = replace_na(n_reads_children, 0),
n_reads_clade = n_reads_clade + n_reads_children)
# Add leaf nodes to resolved hits
hits_resolved <- hits_resolved %>% bind_rows(hits_leaf) %>%
select(-n_children, -n_reads_children)
# Recalculate child counts after removing leaf nodes
hits_n_children <- hits_partial %>% group_by(parent_taxid, .data[[group_var]]) %>%
count(name="n_children") %>% rename(taxid=parent_taxid)
hits_unresolved <- hits_partial %>%
left_join(hits_n_children, by=c("taxid", group_var)) %>%
mutate(n_children = replace_na(n_children, 0))
cat("\tIteration ", iter, ": ", nrow(hits_unresolved), " unresolved, ", nrow(hits_resolved), " resolved.\n", sep="")
}
return(hits_resolved %>% arrange(taxid))
}

count_hits <- function(read_db, taxid_tree, group_var){
# Count direct and indirect hits for each taxon in a set of human-viral reads
cat("Counting direct hits...\n")
hits_direct <- count_hits_direct(read_db, taxid_tree, group_var)
cat("Done.\n")
cat("Counting indirect hits...\n")
hits_indirect <- count_hits_indirect(hits_direct, taxid_tree, group_var)
cat("Done.\n")
return(hits_indirect)
}

#============#
# RUN SCRIPT #
#============#

# Import data
read_db <- read_tsv(read_db_path, show_col_types = FALSE) %>% mutate(taxid = as.integer(taxid))
taxa_db <- read_tsv(taxa_db_path, show_col_types = FALSE) %>% mutate(taxid = as.integer(taxid))

if (nrow(read_db) > 0){
# Count hits
hits_db <- count_hits(read_db, taxa_db, "sample")
# Filter out null rows
hits_db_filtered <- filter(hits_db, n_reads_clade > 0)
} else {
hits_db_filtered <- tibble(
taxid = numeric(), name = character(), rank = character(), parent_taxid = numeric(),
sample = character(), n_reads_direct = numeric(), n_reads_clade = numeric()
)
}

# Write output
write_tsv(hits_db_filtered, out_path)
106 changes: 106 additions & 0 deletions scripts/create_hv_clade_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/usr/bin/env python3

import json
import os
import subprocess
from pathlib import Path

import matplotlib.pyplot as plt # type: ignore
import matplotlib.ticker as ticker # type: ignore
import numpy as np
import pandas as pd
import seaborn as sns # type: ignore
from matplotlib.gridspec import GridSpec # type: ignore
from collections import defaultdict

if not os.path.exists("../bioprojects"):
os.mkdir("../bioprojects")


TARGET_STUDY_METADATA = {
"Bengtsson-Palme 2016": ["PRJEB14051"],
"Munk 2022": [
"PRJEB13831",
"PRJEB27054",
"PRJEB27621",
"PRJEB40798",
"PRJEB40815",
"PRJEB40816",
"PRJEB51229",
],
"Brinch 2020": ["PRJEB13832", "PRJEB34633"],
"Ng 2019": ["PRJNA438174"],
"Maritz 2019": ["PRJEB28033"],
"Brumfield 2022": ["PRJNA812772"],
"Yang 2020": ["PRJNA645711"],
"Spurbeck 2023": ["PRJNA924011"],
"CC 2021": ["PRJNA661613"],
"Rothman 2021": ["PRJNA729801"], # not yet run through the pipeline
}


for study, bioprojects in TARGET_STUDY_METADATA.items():
for bioproject in bioprojects:
study_author = study.split()[0]

if not os.path.exists(
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv"
):
subprocess.run(
[
"aws",
"s3",
"cp",
f"s3://nao-mgs-wb/{study_author}-{bioproject}/output/hv_hits_putative_filtered.tsv.gz",
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv.gz",
]
)
subprocess.run(
[
"gunzip",
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv.gz",
]
)
if not os.path.exists(
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv"
):
subprocess.run(
[
"Rscript",
"collapseHV.R",
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv",
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv",
]
)
if not os.path.exists(f"../taxonomy/viral-taxids.tsv"):
subprocess.run(
[
"aws",
"s3",
"cp",
"s3://nao-mgs-wb/ref/results/viral-taxids.tsv.gz",
"../taxonomy/viral-taxids.tsv.gz",
]
)
subprocess.run(
[
"gunzip",
"../taxonomy/viral-taxids.tsv.gz",
]
)

if not os.path.exists(
f"../bioprojects/{study_author}-{bioproject}/hv_clade_counts_new.tsv"
):
subprocess.run(
[
"Rscript",
"count-viral-taxa.R",
"--reads",
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv",
"--taxa",
"../taxonomy/viral-taxids.tsv",
"--output",
f"../bioprojects/{study_author}-{bioproject}/hv_clade_counts_new.tsv",
]
)

0 comments on commit f7aaddf

Please sign in to comment.