Skip to content

Commit

Permalink
edited supplement_table_1.py toi create a TSV.
Browse files Browse the repository at this point in the history
  • Loading branch information
simonleandergrimm committed Dec 19, 2023
1 parent cfb7629 commit 233e1bc
Showing 1 changed file with 205 additions and 0 deletions.
205 changes: 205 additions & 0 deletions preprint_stats/supplement_table_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
import os
import json
import gzip
import subprocess
from scipy.stats import gmean

if os.path.basename(os.getcwd()) != "preprint_stats":
raise RuntimeError("Run this script from preprint_stats/")


dashboard = os.path.expanduser("~/code/mgs-pipeline/dashboard/")

with open(os.path.join(dashboard, "human_virus_sample_counts.json")) as inf:
human_virus_sample_counts = json.load(inf)

with open(os.path.join(dashboard, "metadata_samples.json")) as inf:
metadata_samples = json.load(inf)

with open(os.path.join(dashboard, "metadata_bioprojects.json")) as inf:
metadata_bioprojects = json.load(inf)

with open(os.path.join(dashboard, "metadata_papers.json")) as inf:
metadata_papers = json.load(inf)

studies = list(metadata_papers.keys())


def create_table():
study_abundances = {}

virus_taxid = 10239

for study in studies:
if study not in [
"Bengtsson-Palme 2016",
"Munk 2022",
"Brinch 2020",
"Ng 2019",
"Maritz 2019",
"Brumfield 2022",
"Rothman 2021",
"Yang 2020",
"Spurbeck 2023",
"Crits-Christoph 2021",
]:
continue

for bioproject in metadata_papers[study]["projects"]:
samples = metadata_bioprojects[bioproject]

if study == "Bengtsson-Palme 2016":
samples = [
sample
for sample in samples
if metadata_samples[sample]["fine_location"].startswith(
"Inlet"
)
]

if study == "Ng 2019":
samples = [
sample
for sample in samples
if metadata_samples[sample]["fine_location"] == "Influent"
]

for sample in samples:
if metadata_samples[sample].get("enrichment") == "panel":
continue

cladecounts = "%s.tsv.gz" % sample
if not os.path.exists(f"../cladecounts/{cladecounts}"):
subprocess.check_call(
[
"aws",
"s3",
"cp",
"s3://nao-mgs/%s/cladecounts/%s"
% (bioproject, cladecounts),
"../cladecounts/",
]
)
with gzip.open(
f"../cladecounts/{cladecounts}", mode="rt"
) as inf:
for line in inf:
(
line_taxid,
_,
_,
clade_assignments,
_,
) = line.strip().split()
taxid = int(line_taxid)
clade_hits = int(clade_assignments)
if taxid == virus_taxid:
if study not in study_abundances:
study_abundances[study] = [
[],
[],
]
study_abundances[study][1].append(
clade_hits / metadata_samples[sample]["reads"]
)
humanreads = "%s.humanviruses.tsv" % sample

if not os.path.exists(f"../humanviruses/{humanreads}"):
print(
"Downloading %s from %s" % (humanreads, bioproject),
flush=True,
)
subprocess.check_call(
[
"aws",
"s3",
"cp",
"s3://nao-mgs/%s/humanviruses/%s"
% (bioproject, humanreads),
"../humanviruses/",
]
)

with open(f"../humanviruses/{humanreads}") as inf:
human_virus_reads = 0
for line in inf:
(
line_taxid,
clade_assignments,
_,
) = line.strip().split("\t")
clade_hits = int(clade_assignments)
human_virus_reads += int(clade_hits)
human_virus_relative_abundance = (
human_virus_reads / metadata_samples[sample]["reads"]
)

if human_virus_relative_abundance > 0.0:
study_abundances[study][0].append(
human_virus_relative_abundance
)

gmean_human_virus_shares = []
gmean_all_virus_shares = []
table = [
(
"Study",
"Human Viruses",
"All Viruses",
)
]

for study, abundances in study_abundances.items():
gmean_human_virus_shares.append(gmean(abundances[0]))
gmean_all_virus_shares.append(gmean(abundances[1]))
table.append(
(
study,
gmean(abundances[0]),
gmean(abundances[1]),
)
)

table.append(
(
"All studies",
gmean(gmean_human_virus_shares),
gmean(gmean_all_virus_shares),
)
)

return table


def format_number(num):
ROUNDING_DIGITS = 2
if "e" in "{:.9e}".format(num):
base, exponent = "{:.9e}".format(num).split("e")
rounded_base = round(float(base), ROUNDING_DIGITS)
target_read_per_n = int(1 / num)
return "{} * 10^{} (1 in {})".format(
rounded_base, int(exponent), target_read_per_n
)
else:
return str(num)


def start():
table = create_table()
formatted_table = []
for row in table:
formatted_row = []
for cell in row:
if isinstance(cell, float):
formatted_row.append(format_number(cell))
else:
formatted_row.append(str(cell))
formatted_table.append("\t".join(formatted_row))

with open("supplement_table_1.tsv", "w") as file:
for line in formatted_table:
file.write(line + "\n")


if __name__ == "__main__":
start()

0 comments on commit 233e1bc

Please sign in to comment.