Created a script to adadpt new mgs-workflow code to create newest ver…

…sion of hv-clade-counts.tsv, which creates correct (and slightly lower) n_reads_clade counts/
naobservatory · Jul 9, 2024 · f7aaddf · f7aaddf
1 parent 9e69245
commit f7aaddf
Show file tree

Hide file tree

Showing 3 changed files with 265 additions and 0 deletions.
diff --git a/scripts/collapseHV.R b/scripts/collapseHV.R
@@ -0,0 +1,38 @@
+#!/usr/bin/env Rscript
+
+library(tidyverse)
+
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 2) {
+  stop("Usage: Rscript collapseHV-main.R <hv_hits_filtered> <hv_hits_collapsed>")
+}
+
+hv_hits_filtered <- args[1]
+hv_hits_collapsed <- args[2]
+
+rmax <- function(x){
+    if (all(is.na(x))) return(NA)
+    return(max(x, na.rm = TRUE))}
+
+collapse <- function(x) ifelse(all(x == x[1]), x[1], paste(x, collapse="/"))
+
+hits_filtered <- read_tsv(hv_hits_filtered, col_names = TRUE, show_col_types = FALSE)
+print(dim(hits_filtered))
+
+reads_collapsed <- hits_filtered %>% group_by(seq_id) %>% summarize(
+    sample = collapse(sample), genome_id = collapse(genome_id),
+    taxid_best = taxid[1], taxid = collapse(as.character(taxid)),
+    best_alignment_score_fwd = rmax(best_alignment_score_fwd),
+    best_alignment_score_rev = rmax(best_alignment_score_rev),
+    query_len_fwd = rmax(query_len_fwd), query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],
+    query_len_rev = rmax(query_len_rev), query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],
+    classified = rmax(classified), assigned_name = collapse(assigned_name),
+    assigned_taxid_best = assigned_taxid[1], assigned_taxid = collapse(as.character(assigned_taxid)),
+    assigned_hv = rmax(assigned_hv), hit_hv = rmax(hit_hv), encoded_hits = collapse(encoded_hits),
+    adj_score_fwd = rmax(adj_score_fwd), adj_score_rev = rmax(adj_score_rev)
+    ) %>% mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev))
+
+print(dim(reads_collapsed))
+
+# Write output
+write_tsv(reads_collapsed, hv_hits_collapsed)
diff --git a/scripts/count-viral-taxa.R b/scripts/count-viral-taxa.R
@@ -0,0 +1,121 @@
+#!/usr/bin/env Rscript
+
+library(optparse)
+library(tidyverse)
+
+# Set arguments
+option_list = list(
+  make_option(c("--reads"), type="character", default=NULL,
+              help="Path to TSV file of human-infecting virus read data."),
+  make_option(c("--taxa"), type="character", default=NULL,
+              help="Path to database of viral taxonomic relationships"),
+  make_option(c("--output"), type="character", default=NULL,
+              help="Path to output file.")
+)
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);
+
+# Set input and output paths
+read_db_path <- opt$reads
+taxa_db_path <- opt$taxa
+out_path   <- opt$output
+
+#=====================#
+# AUXILIARY FUNCTIONS #
+#=====================#
+
+count_children <- function(taxid_tree){
+    # Count number of direct children of each node in a taxid tree
+    n_children <- taxid_tree %>% group_by(parent_taxid) %>%
+        count(name="n_children") %>% rename(taxid = parent_taxid)
+    db_out <- taxid_tree %>% left_join(n_children, by="taxid") %>%
+        mutate(n_children = replace_na(n_children, 0))
+    return(db_out)
+}
+
+count_hits_direct <- function(read_db, taxid_tree, group_var){
+    # Count number of reads directly assigned to each taxid in a tree
+    taxids <- count_children(taxid_tree)
+    direct_hits_setup <- read_db %>% group_by(taxid, .data[[group_var]]) %>%
+        count(name="n_reads_direct", .drop=FALSE) %>%
+        pivot_wider(names_from=any_of(group_var), values_from=n_reads_direct,
+                    values_fill = 0)
+    direct_hits_joined <- taxids %>% left_join(direct_hits_setup, by="taxid")
+    direct_hits <- taxids %>% left_join(direct_hits_setup, by="taxid") %>%
+        pivot_longer(cols=-(taxid:n_children), names_to=group_var,
+                     values_to="n_reads_direct") %>%
+        mutate(n_reads_direct = replace_na(n_reads_direct, 0))
+    # Double pivot ensures all viruses have a count for all samples
+    return(direct_hits %>% select(-n_children))
+}
+
+count_hits_indirect <- function(direct_hits, taxid_tree, group_var){
+    # Count number of reads assigned to each taxid in a tree or its descendents
+    iter <- 0
+    taxids <- count_children(taxid_tree)
+    n_children <- taxids %>% select(taxid, n_children)
+    hits_unresolved <- direct_hits %>% left_join(n_children, by="taxid") %>%
+        mutate(n_reads_clade = n_reads_direct, n_reads_children = 0)
+    hits_resolved <- tibble()
+    cat("\tIteration ", iter, ": ", nrow(hits_unresolved), " unresolved, ", nrow(hits_resolved), " resolved.\n", sep="")
+    while (nrow(hits_unresolved) > 0){
+        iter <- iter + 1
+        # Identify leaf nodes
+        hits_leaf <- hits_unresolved %>% filter(n_children == 0)
+        # Calculate contributions to parent nodes
+        hits_children <- hits_leaf %>% group_by(parent_taxid, .data[[group_var]]) %>%
+            summarize(n_reads_children = sum(n_reads_clade), .groups = "drop") %>%
+            rename(taxid = parent_taxid)
+        # Add child reads to direct hits to calculate clade reads
+        hits_partial <- hits_unresolved %>% filter(n_children > 0) %>%
+            select(-n_children, -n_reads_children) %>%
+            left_join(hits_children, by=c("taxid", group_var)) %>%
+            mutate(n_reads_children = replace_na(n_reads_children, 0),
+                   n_reads_clade = n_reads_clade + n_reads_children)
+        # Add leaf nodes to resolved hits
+        hits_resolved <- hits_resolved %>% bind_rows(hits_leaf) %>%
+            select(-n_children, -n_reads_children)
+        # Recalculate child counts after removing leaf nodes
+        hits_n_children <- hits_partial %>% group_by(parent_taxid, .data[[group_var]]) %>%
+            count(name="n_children") %>% rename(taxid=parent_taxid)
+        hits_unresolved <- hits_partial %>%
+            left_join(hits_n_children, by=c("taxid", group_var)) %>%
+            mutate(n_children = replace_na(n_children, 0))
+        cat("\tIteration ", iter, ": ", nrow(hits_unresolved), " unresolved, ", nrow(hits_resolved), " resolved.\n", sep="")
+    }
+    return(hits_resolved %>% arrange(taxid))
+}
+
+count_hits <- function(read_db, taxid_tree, group_var){
+    # Count direct and indirect hits for each taxon in a set of human-viral reads
+    cat("Counting direct hits...\n")
+    hits_direct <- count_hits_direct(read_db, taxid_tree, group_var)
+    cat("Done.\n")
+    cat("Counting indirect hits...\n")
+    hits_indirect <- count_hits_indirect(hits_direct, taxid_tree, group_var)
+    cat("Done.\n")
+    return(hits_indirect)
+}
+
+#============#
+# RUN SCRIPT #
+#============#
+
+# Import data
+read_db <- read_tsv(read_db_path, show_col_types = FALSE) %>% mutate(taxid = as.integer(taxid))
+taxa_db <- read_tsv(taxa_db_path, show_col_types = FALSE) %>% mutate(taxid = as.integer(taxid))
+
+if (nrow(read_db) > 0){
+    # Count hits
+    hits_db <- count_hits(read_db, taxa_db, "sample")
+    # Filter out null rows
+    hits_db_filtered <- filter(hits_db, n_reads_clade > 0)
+} else {
+    hits_db_filtered <- tibble(
+        taxid = numeric(), name = character(), rank = character(), parent_taxid = numeric(),
+        sample = character(), n_reads_direct = numeric(), n_reads_clade = numeric()
+    )
+}
+
+# Write output
+write_tsv(hits_db_filtered, out_path)
diff --git a/scripts/create_hv_clade_counts.py b/scripts/create_hv_clade_counts.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+import json
+import os
+import subprocess
+from pathlib import Path
+
+import matplotlib.pyplot as plt  # type: ignore
+import matplotlib.ticker as ticker  # type: ignore
+import numpy as np
+import pandas as pd
+import seaborn as sns  # type: ignore
+from matplotlib.gridspec import GridSpec  # type: ignore
+from collections import defaultdict
+
+if not os.path.exists("../bioprojects"):
+    os.mkdir("../bioprojects")
+
+
+TARGET_STUDY_METADATA = {
+    "Bengtsson-Palme 2016": ["PRJEB14051"],
+    "Munk 2022": [
+        "PRJEB13831",
+        "PRJEB27054",
+        "PRJEB27621",
+        "PRJEB40798",
+        "PRJEB40815",
+        "PRJEB40816",
+        "PRJEB51229",
+    ],
+    "Brinch 2020": ["PRJEB13832", "PRJEB34633"],
+    "Ng 2019": ["PRJNA438174"],
+    "Maritz 2019": ["PRJEB28033"],
+    "Brumfield 2022": ["PRJNA812772"],
+    "Yang 2020": ["PRJNA645711"],
+    "Spurbeck 2023": ["PRJNA924011"],
+    "CC 2021": ["PRJNA661613"],
+    "Rothman 2021": ["PRJNA729801"],  # not yet run through the pipeline
+}
+
+
+for study, bioprojects in TARGET_STUDY_METADATA.items():
+    for bioproject in bioprojects:
+        study_author = study.split()[0]
+
+        if not os.path.exists(
+            f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv"
+        ):
+            subprocess.run(
+                [
+                    "aws",
+                    "s3",
+                    "cp",
+                    f"s3://nao-mgs-wb/{study_author}-{bioproject}/output/hv_hits_putative_filtered.tsv.gz",
+                    f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv.gz",
+                ]
+            )
+            subprocess.run(
+                [
+                    "gunzip",
+                    f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv.gz",
+                ]
+            )
+        if not os.path.exists(
+            f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv"
+        ):
+            subprocess.run(
+                [
+                    "Rscript",
+                    "collapseHV.R",
+                    f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv",
+                    f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv",
+                ]
+            )
+        if not os.path.exists(f"../taxonomy/viral-taxids.tsv"):
+            subprocess.run(
+                [
+                    "aws",
+                    "s3",
+                    "cp",
+                    "s3://nao-mgs-wb/ref/results/viral-taxids.tsv.gz",
+                    "../taxonomy/viral-taxids.tsv.gz",
+                ]
+            )
+            subprocess.run(
+                [
+                    "gunzip",
+                    "../taxonomy/viral-taxids.tsv.gz",
+                ]
+            )
+
+        if not os.path.exists(
+            f"../bioprojects/{study_author}-{bioproject}/hv_clade_counts_new.tsv"
+        ):
+            subprocess.run(
+                [
+                    "Rscript",
+                    "count-viral-taxa.R",
+                    "--reads",
+                    f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv",
+                    "--taxa",
+                    "../taxonomy/viral-taxids.tsv",
+                    "--output",
+                    f"../bioprojects/{study_author}-{bioproject}/hv_clade_counts_new.tsv",
+                ]
+            )