-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Created a script to adadpt new mgs-workflow code to create newest ver…
…sion of hv-clade-counts.tsv, which creates correct (and slightly lower) n_reads_clade counts/
- Loading branch information
1 parent
9e69245
commit f7aaddf
Showing
3 changed files
with
265 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
library(tidyverse) | ||
|
||
args <- commandArgs(trailingOnly = TRUE) | ||
if (length(args) != 2) { | ||
stop("Usage: Rscript collapseHV-main.R <hv_hits_filtered> <hv_hits_collapsed>") | ||
} | ||
|
||
hv_hits_filtered <- args[1] | ||
hv_hits_collapsed <- args[2] | ||
|
||
rmax <- function(x){ | ||
if (all(is.na(x))) return(NA) | ||
return(max(x, na.rm = TRUE))} | ||
|
||
collapse <- function(x) ifelse(all(x == x[1]), x[1], paste(x, collapse="/")) | ||
|
||
hits_filtered <- read_tsv(hv_hits_filtered, col_names = TRUE, show_col_types = FALSE) | ||
print(dim(hits_filtered)) | ||
|
||
reads_collapsed <- hits_filtered %>% group_by(seq_id) %>% summarize( | ||
sample = collapse(sample), genome_id = collapse(genome_id), | ||
taxid_best = taxid[1], taxid = collapse(as.character(taxid)), | ||
best_alignment_score_fwd = rmax(best_alignment_score_fwd), | ||
best_alignment_score_rev = rmax(best_alignment_score_rev), | ||
query_len_fwd = rmax(query_len_fwd), query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1], | ||
query_len_rev = rmax(query_len_rev), query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1], | ||
classified = rmax(classified), assigned_name = collapse(assigned_name), | ||
assigned_taxid_best = assigned_taxid[1], assigned_taxid = collapse(as.character(assigned_taxid)), | ||
assigned_hv = rmax(assigned_hv), hit_hv = rmax(hit_hv), encoded_hits = collapse(encoded_hits), | ||
adj_score_fwd = rmax(adj_score_fwd), adj_score_rev = rmax(adj_score_rev) | ||
) %>% mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev)) | ||
|
||
print(dim(reads_collapsed)) | ||
|
||
# Write output | ||
write_tsv(reads_collapsed, hv_hits_collapsed) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
library(optparse) | ||
library(tidyverse) | ||
|
||
# Set arguments | ||
option_list = list( | ||
make_option(c("--reads"), type="character", default=NULL, | ||
help="Path to TSV file of human-infecting virus read data."), | ||
make_option(c("--taxa"), type="character", default=NULL, | ||
help="Path to database of viral taxonomic relationships"), | ||
make_option(c("--output"), type="character", default=NULL, | ||
help="Path to output file.") | ||
) | ||
opt_parser = OptionParser(option_list=option_list); | ||
opt = parse_args(opt_parser); | ||
|
||
# Set input and output paths | ||
read_db_path <- opt$reads | ||
taxa_db_path <- opt$taxa | ||
out_path <- opt$output | ||
|
||
#=====================# | ||
# AUXILIARY FUNCTIONS # | ||
#=====================# | ||
|
||
count_children <- function(taxid_tree){ | ||
# Count number of direct children of each node in a taxid tree | ||
n_children <- taxid_tree %>% group_by(parent_taxid) %>% | ||
count(name="n_children") %>% rename(taxid = parent_taxid) | ||
db_out <- taxid_tree %>% left_join(n_children, by="taxid") %>% | ||
mutate(n_children = replace_na(n_children, 0)) | ||
return(db_out) | ||
} | ||
|
||
count_hits_direct <- function(read_db, taxid_tree, group_var){ | ||
# Count number of reads directly assigned to each taxid in a tree | ||
taxids <- count_children(taxid_tree) | ||
direct_hits_setup <- read_db %>% group_by(taxid, .data[[group_var]]) %>% | ||
count(name="n_reads_direct", .drop=FALSE) %>% | ||
pivot_wider(names_from=any_of(group_var), values_from=n_reads_direct, | ||
values_fill = 0) | ||
direct_hits_joined <- taxids %>% left_join(direct_hits_setup, by="taxid") | ||
direct_hits <- taxids %>% left_join(direct_hits_setup, by="taxid") %>% | ||
pivot_longer(cols=-(taxid:n_children), names_to=group_var, | ||
values_to="n_reads_direct") %>% | ||
mutate(n_reads_direct = replace_na(n_reads_direct, 0)) | ||
# Double pivot ensures all viruses have a count for all samples | ||
return(direct_hits %>% select(-n_children)) | ||
} | ||
|
||
count_hits_indirect <- function(direct_hits, taxid_tree, group_var){ | ||
# Count number of reads assigned to each taxid in a tree or its descendents | ||
iter <- 0 | ||
taxids <- count_children(taxid_tree) | ||
n_children <- taxids %>% select(taxid, n_children) | ||
hits_unresolved <- direct_hits %>% left_join(n_children, by="taxid") %>% | ||
mutate(n_reads_clade = n_reads_direct, n_reads_children = 0) | ||
hits_resolved <- tibble() | ||
cat("\tIteration ", iter, ": ", nrow(hits_unresolved), " unresolved, ", nrow(hits_resolved), " resolved.\n", sep="") | ||
while (nrow(hits_unresolved) > 0){ | ||
iter <- iter + 1 | ||
# Identify leaf nodes | ||
hits_leaf <- hits_unresolved %>% filter(n_children == 0) | ||
# Calculate contributions to parent nodes | ||
hits_children <- hits_leaf %>% group_by(parent_taxid, .data[[group_var]]) %>% | ||
summarize(n_reads_children = sum(n_reads_clade), .groups = "drop") %>% | ||
rename(taxid = parent_taxid) | ||
# Add child reads to direct hits to calculate clade reads | ||
hits_partial <- hits_unresolved %>% filter(n_children > 0) %>% | ||
select(-n_children, -n_reads_children) %>% | ||
left_join(hits_children, by=c("taxid", group_var)) %>% | ||
mutate(n_reads_children = replace_na(n_reads_children, 0), | ||
n_reads_clade = n_reads_clade + n_reads_children) | ||
# Add leaf nodes to resolved hits | ||
hits_resolved <- hits_resolved %>% bind_rows(hits_leaf) %>% | ||
select(-n_children, -n_reads_children) | ||
# Recalculate child counts after removing leaf nodes | ||
hits_n_children <- hits_partial %>% group_by(parent_taxid, .data[[group_var]]) %>% | ||
count(name="n_children") %>% rename(taxid=parent_taxid) | ||
hits_unresolved <- hits_partial %>% | ||
left_join(hits_n_children, by=c("taxid", group_var)) %>% | ||
mutate(n_children = replace_na(n_children, 0)) | ||
cat("\tIteration ", iter, ": ", nrow(hits_unresolved), " unresolved, ", nrow(hits_resolved), " resolved.\n", sep="") | ||
} | ||
return(hits_resolved %>% arrange(taxid)) | ||
} | ||
|
||
count_hits <- function(read_db, taxid_tree, group_var){ | ||
# Count direct and indirect hits for each taxon in a set of human-viral reads | ||
cat("Counting direct hits...\n") | ||
hits_direct <- count_hits_direct(read_db, taxid_tree, group_var) | ||
cat("Done.\n") | ||
cat("Counting indirect hits...\n") | ||
hits_indirect <- count_hits_indirect(hits_direct, taxid_tree, group_var) | ||
cat("Done.\n") | ||
return(hits_indirect) | ||
} | ||
|
||
#============# | ||
# RUN SCRIPT # | ||
#============# | ||
|
||
# Import data | ||
read_db <- read_tsv(read_db_path, show_col_types = FALSE) %>% mutate(taxid = as.integer(taxid)) | ||
taxa_db <- read_tsv(taxa_db_path, show_col_types = FALSE) %>% mutate(taxid = as.integer(taxid)) | ||
|
||
if (nrow(read_db) > 0){ | ||
# Count hits | ||
hits_db <- count_hits(read_db, taxa_db, "sample") | ||
# Filter out null rows | ||
hits_db_filtered <- filter(hits_db, n_reads_clade > 0) | ||
} else { | ||
hits_db_filtered <- tibble( | ||
taxid = numeric(), name = character(), rank = character(), parent_taxid = numeric(), | ||
sample = character(), n_reads_direct = numeric(), n_reads_clade = numeric() | ||
) | ||
} | ||
|
||
# Write output | ||
write_tsv(hits_db_filtered, out_path) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import json | ||
import os | ||
import subprocess | ||
from pathlib import Path | ||
|
||
import matplotlib.pyplot as plt # type: ignore | ||
import matplotlib.ticker as ticker # type: ignore | ||
import numpy as np | ||
import pandas as pd | ||
import seaborn as sns # type: ignore | ||
from matplotlib.gridspec import GridSpec # type: ignore | ||
from collections import defaultdict | ||
|
||
if not os.path.exists("../bioprojects"): | ||
os.mkdir("../bioprojects") | ||
|
||
|
||
TARGET_STUDY_METADATA = { | ||
"Bengtsson-Palme 2016": ["PRJEB14051"], | ||
"Munk 2022": [ | ||
"PRJEB13831", | ||
"PRJEB27054", | ||
"PRJEB27621", | ||
"PRJEB40798", | ||
"PRJEB40815", | ||
"PRJEB40816", | ||
"PRJEB51229", | ||
], | ||
"Brinch 2020": ["PRJEB13832", "PRJEB34633"], | ||
"Ng 2019": ["PRJNA438174"], | ||
"Maritz 2019": ["PRJEB28033"], | ||
"Brumfield 2022": ["PRJNA812772"], | ||
"Yang 2020": ["PRJNA645711"], | ||
"Spurbeck 2023": ["PRJNA924011"], | ||
"CC 2021": ["PRJNA661613"], | ||
"Rothman 2021": ["PRJNA729801"], # not yet run through the pipeline | ||
} | ||
|
||
|
||
for study, bioprojects in TARGET_STUDY_METADATA.items(): | ||
for bioproject in bioprojects: | ||
study_author = study.split()[0] | ||
|
||
if not os.path.exists( | ||
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv" | ||
): | ||
subprocess.run( | ||
[ | ||
"aws", | ||
"s3", | ||
"cp", | ||
f"s3://nao-mgs-wb/{study_author}-{bioproject}/output/hv_hits_putative_filtered.tsv.gz", | ||
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv.gz", | ||
] | ||
) | ||
subprocess.run( | ||
[ | ||
"gunzip", | ||
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv.gz", | ||
] | ||
) | ||
if not os.path.exists( | ||
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv" | ||
): | ||
subprocess.run( | ||
[ | ||
"Rscript", | ||
"collapseHV.R", | ||
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_filtered.tsv", | ||
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv", | ||
] | ||
) | ||
if not os.path.exists(f"../taxonomy/viral-taxids.tsv"): | ||
subprocess.run( | ||
[ | ||
"aws", | ||
"s3", | ||
"cp", | ||
"s3://nao-mgs-wb/ref/results/viral-taxids.tsv.gz", | ||
"../taxonomy/viral-taxids.tsv.gz", | ||
] | ||
) | ||
subprocess.run( | ||
[ | ||
"gunzip", | ||
"../taxonomy/viral-taxids.tsv.gz", | ||
] | ||
) | ||
|
||
if not os.path.exists( | ||
f"../bioprojects/{study_author}-{bioproject}/hv_clade_counts_new.tsv" | ||
): | ||
subprocess.run( | ||
[ | ||
"Rscript", | ||
"count-viral-taxa.R", | ||
"--reads", | ||
f"../bioprojects/{study_author}-{bioproject}/hv_hits_putative_collapsed.tsv", | ||
"--taxa", | ||
"../taxonomy/viral-taxids.tsv", | ||
"--output", | ||
f"../bioprojects/{study_author}-{bioproject}/hv_clade_counts_new.tsv", | ||
] | ||
) |