Johnson Flu A: hack together a rough P2RA estimate #240

Open
wants to merge 3 commits into base: main
6 changes: 3 additions & 3 deletions figures/counts-by-pathogen-and-study.py
@@ -86,9 +86,9 @@ def start():
matching_reads = mgs_data.viral_reads(bioproject, taxids)
viral_samples = mgs_data.sample_attributes(
bioproject,
enrichment=None
if study == "brinch"
else mgs.Enrichment.VIRAL,
enrichment=(
None if study == "brinch" else mgs.Enrichment.VIRAL
),
)
n_samples += len(viral_samples)
counts = [
2 changes: 1 addition & 1 deletion fit.py
@@ -37,7 +37,7 @@ def start(num_samples: int, plot: bool) -> None:
) in predictors_by_taxid():
taxids_str = "_".join(str(t) for t in taxids)
for study, bioprojects in target_bioprojects.items():
enrichment = None if study == "brinch" else Enrichment.VIRAL
enrichment = None
model = stats.build_model(
mgs_data,
bioprojects,
448 changes: 4 additions & 444 deletions fits_summary.tsv

Large diffs are not rendered by default.

6,114 changes: 28 additions & 6,086 deletions input.tsv

Large diffs are not rendered by default.

69 changes: 37 additions & 32 deletions mgs.py
@@ -1,3 +1,4 @@
import os
import json
import urllib.request
from collections import Counter
@@ -14,7 +15,7 @@

MGS_REPO_DEFAULTS = {
"user": "naobservatory",
"repo": "mgs-pipeline",
"repo": "mgs-restricted",
"ref": "data-2023-07-21",
}

@@ -23,10 +24,8 @@


target_bioprojects = {
"crits_christoph": [BioProject("PRJNA661613")],
"rothman": [BioProject("PRJNA729801")],
"spurbeck": [BioProject("PRJNA924011")],
"brinch": [BioProject("PRJEB13832"), BioProject("PRJEB34633")],
"johnson": [BioProject("MJ-2024-02-12"),
BioProject("MJ-2024-03-04")],
}


@@ -37,18 +36,9 @@ class GitHubRepo:
ref: str

def get_file(self, path: str) -> str:
file_url = (
f"https://raw.githubusercontent.com/"
f"{self.user}/{self.repo}/{self.ref}/{path}"
)
with urllib.request.urlopen(file_url) as response:
if response.status == 200:
return response.read()
else:
raise ValueError(
f"Failed to download {file_url}. "
f"Response status code: {response.status}"
)
with open(os.path.expanduser(
f"~/code/{self.repo}/{path}")) as inf:
return inf.read()


def load_bioprojects(repo: GitHubRepo) -> dict[BioProject, list[Sample]]:
@@ -68,20 +58,33 @@ class SampleAttributes(BaseModel):
country: str
state: Optional[str] = None
county: Optional[str] = None
location: str
location: Optional[str] = None
fine_location: Optional[str] = None
# Fixme: Not all the dates are real dates
date: date | str
reads: int
edta_treated: Optional[bool|str] = False
readlengths: Optional[dict] = None
ribofrac: Optional[float] = None
nuclease_treated: Optional[bool|str] = False
sample_volume_ml: Optional[str] = None
enrichment: Optional[Enrichment] = None
concentration_method: Optional[str] = None
method: Optional[str] = None
airport: Optional[str] = None
collection: Optional[str] = None



def load_sample_attributes(repo: GitHubRepo) -> dict[Sample, SampleAttributes]:
data = json.loads(repo.get_file("dashboard/metadata_samples.json"))
return {
Sample(s): SampleAttributes(**attribs) for s, attribs in data.items()
}
sa = {Sample(s): SampleAttributes(**attribs)
for s, attribs in data.items()}
for a in sa.values():
a.location = "n/a"
a.fine_location = "n/a"

return sa


SampleCounts = dict[TaxID, dict[Sample, int]]
@@ -106,9 +109,11 @@ def make_count_tree(
taxtree: Tree[TaxID], sample_counts: SampleCounts
) -> Tree[tuple[TaxID, Counter[Sample]]]:
return taxtree.map(
lambda taxid: (taxid, Counter(sample_counts[taxid]))
if taxid in sample_counts
else (taxid, Counter()),
lambda taxid: (
(taxid, Counter(sample_counts[taxid]))
if taxid in sample_counts
else (taxid, Counter())
),
)


@@ -151,14 +156,14 @@ def sample_attributes(
samples = {
s: self.sample_attrs[s] for s in self.bioprojects[bioproject]
}
if enrichment:
return {
s: attrs
for s, attrs in samples.items()
if attrs.enrichment == enrichment
}
else:
return samples
return {
s: attrs
for s, attrs in samples.items()
# Remove EDTA-treated samples from the second run, where edta_treated
# is True/False, but not the first run, where it was Yes/No; in the
# second run the treatment was much too aggressive.
if not attrs.edta_treated == True
}

def total_reads(self, bioproject: BioProject) -> dict[Sample, int]:
return {
3 changes: 1 addition & 2 deletions pathogen_properties.py
@@ -301,8 +301,7 @@ def average(in1: "Scalar", in2: "Scalar"):

class Predictor(abc.ABC, Variable):
@abc.abstractmethod
def get_data(self) -> float:
...
def get_data(self) -> float: ...


@dataclass(kw_only=True, eq=True, frozen=True)
2 changes: 1 addition & 1 deletion pathogens/aav2.py
@@ -29,7 +29,7 @@
# (93K%2C%20docx)-,Supplemental%20data%3A,-Click%20here%20to
date="2022",
active=Active.LATENT,
source="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9063149/#:~:text=Seropositivity%20for%20(A)%20the%20global%20population"
source="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9063149/#:~:text=Seropositivity%20for%20(A)%20the%20global%20population",
# This number matches AAV-2 seroprevalence in a study of 101 males with
# Duchenne Muscular Dystrophy, showing a seroprevalence of 56%:
# "https://pubmed.ncbi.nlm.nih.gov/36324212/#:~:text=We%20prospectively%20enrolled,and%20AAV8%20(47%25)."
2 changes: 1 addition & 1 deletion pathogens/bkv.py
@@ -61,7 +61,7 @@
date="2007",
# Study is from 2009, sera are from 2007
active=Active.LATENT,
source="https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=We%20found%20the%20seroprevalence%20(%2B/%E2%88%92%201%25)%20in%20healthy%20adult%20blood%20donors%20(1501)%20was%20SV40%20(9%25)%2C%20BKV%20(82%25)%2C%20JCV%20(39%25)%2C%20LPV"
source="https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=We%20found%20the%20seroprevalence%20(%2B/%E2%88%92%201%25)%20in%20healthy%20adult%20blood%20donors%20(1501)%20was%20SV40%20(9%25)%2C%20BKV%20(82%25)%2C%20JCV%20(39%25)%2C%20LPV",
# A tabular breakdown can be found here: "https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=original%20image-,Table%201.,-Seroprevalence%20of%20polyomaviruses"
)

2 changes: 1 addition & 1 deletion pathogens/hcv.py
@@ -25,7 +25,7 @@
date="2020",
country="United States",
state="Ohio",
source="https://odh.ohio.gov/wps/wcm/connect/gov/ec0dec22-1eea-4d17-a86a-ac4bc35be4d3/HCV+5+Year+Report+2021.pdf?MOD=AJPERES&CONVERT_TO=url&CACHEID=ROOTWORKSPACE.Z18_M1HGGIK0N0JO00QO9DDDDM3000-ec0dec22-1eea-4d17-a86a-ac4bc35be4d3-oqU9kQ8"
source="https://odh.ohio.gov/wps/wcm/connect/gov/ec0dec22-1eea-4d17-a86a-ac4bc35be4d3/HCV+5+Year+Report+2021.pdf?MOD=AJPERES&CONVERT_TO=url&CACHEID=ROOTWORKSPACE.Z18_M1HGGIK0N0JO00QO9DDDDM3000-ec0dec22-1eea-4d17-a86a-ac4bc35be4d3-oqU9kQ8",
# Page 1 of 8
)

2 changes: 1 addition & 1 deletion pathogens/hiv.py
@@ -89,7 +89,7 @@
people=652_221,
country="Denmark",
date="2022",
source="https://kk.statistikbank.dk/statbank5a/SelectVarVal/Define.asp?MainTable=KKBEF1"
source="https://kk.statistikbank.dk/statbank5a/SelectVarVal/Define.asp?MainTable=KKBEF1",
# Search variables: Copenhagen in total, Gender in general, Age in total,
# 2022 Q4
)
23 changes: 22 additions & 1 deletion pathogens/influenza.py
@@ -140,6 +140,19 @@ def load_weekly_data() -> WeeklyData:
source="https://www.cdc.gov/flu/about/burden/2021-2022.htm#:~:text=The%20overall%20burden%20of%20influenza%20(flu)%20for%20the%202021%2D2022%20season%20was%20an%20estimated%C2%A09%20million%C2%A0flu%20illnesses",
)

infections_2023_2024 = IncidenceAbsolute(
annual_infections=41_500_000,
confidence_interval=(29_000_000, 54_000_000),
Collaborator:
This is a technical quibble, but I can't tell from glancing at the website whether their estimated range is technically a confidence interval.

Member Author:
You're right, it may not be.

I've used the midpoint of the two numbers from the website; the range isn't currently used for anything.
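For reference, the arithmetic behind the 41,500,000 figure is just the midpoint of the CDC's preliminary range; a minimal illustrative sketch, not part of the diff:

# Illustrative only: midpoint of the CDC's preliminary 29-54 million range.
low, high = 29_000_000, 54_000_000
annual_infections = (low + high) // 2
assert annual_infections == 41_500_000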

# The estimate covers only part of the year, but the code that works with these
# understands start and end dates.
start_date="2023-10-01",
end_date="2024-03-09",
tag="us-2023-2024",
country="United States",
# "CDC estimates that, from October 1, 2023 through March 9, 2024, there
# have been: 29 – 54 million flu illnesses"
source="https://www.cdc.gov/flu/about/burden/preliminary-in-season-estimates.htm"
)

def compare_incidence_to_positive_tests(
official_incidence: IncidenceAbsolute, weekly_data: WeeklyData
@@ -190,6 +203,9 @@ def estimate_incidences() -> list[IncidenceRate]:
underreporting_2021_2022 = compare_incidence_to_positive_tests(
infections_2021_2022, weekly_data
)
underreporting_2023_2024 = compare_incidence_to_positive_tests(
infections_2023_2024, weekly_data
)

# The CDC didn't estimate annual infections for 2020-2021, but assume
# underreporting is the average of the two years we do have.
@@ -220,11 +236,16 @@ def get_underreporting(
and start <= infections_2021_2022.parsed_start
):
return underreporting_2020_2021
elif (
start >= infections_2023_2024.parsed_start
and start <= infections_2023_2024.parsed_end
):
return underreporting_2023_2024
else:
return None

incidences = []
for state in ["California", "Ohio"]:
for state in ["Missouri"]:
for parsed_start in weekly_data[state]:
positive_a, positive_b = weekly_data[state][parsed_start]
parsed_end = parsed_start + datetime.timedelta(weeks=1)
2 changes: 1 addition & 1 deletion pathogens/jcv.py
@@ -57,7 +57,7 @@
date="2007",
# Study is from 2009, sera are from 2007
active=Active.LATENT,
source="https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=We%20found%20the%20seroprevalence%20(%2B/%E2%88%92%201%25)%20in%20healthy%20adult%20blood%20donors%20(1501)%20was%20SV40%20(9%25)%2C%20BKV%20(82%25)%2C%20JCV%20(39%25)%2C%20LPV"
source="https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=We%20found%20the%20seroprevalence%20(%2B/%E2%88%92%201%25)%20in%20healthy%20adult%20blood%20donors%20(1501)%20was%20SV40%20(9%25)%2C%20BKV%20(82%25)%2C%20JCV%20(39%25)%2C%20LPV",
# A tabular breakdown can be found here: "https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=original%20image-,Table%201.,-Seroprevalence%20of%20polyomaviruses"
)

2 changes: 1 addition & 1 deletion pathogens/mcv.py
@@ -38,7 +38,7 @@
country="United States",
date="2009",
active=Active.LATENT,
source="https://academic.oup.com/jnci/article/101/21/1510/963538#:~:text=A%20cut%20point%20of%2015%E2%80%89000%20median%20fluorescent%20intensity%20units%20was%20chosen%20as%20described%20above%20and%20used%20to%20identify%20268%20(59.4%25)%20of%20the%20451%20control%20subjects%20as%20seropositive%20for%20MCPyV%20VP1%3B%20this%20percentage%20was%20similar%20to%20that%20among%20women%20in%20control%20group%201%20(ie%2C%2053%25)."
source="https://academic.oup.com/jnci/article/101/21/1510/963538#:~:text=A%20cut%20point%20of%2015%E2%80%89000%20median%20fluorescent%20intensity%20units%20was%20chosen%20as%20described%20above%20and%20used%20to%20identify%20268%20(59.4%25)%20of%20the%20451%20control%20subjects%20as%20seropositive%20for%20MCPyV%20VP1%3B%20this%20percentage%20was%20similar%20to%20that%20among%20women%20in%20control%20group%201%20(ie%2C%2053%25).",
# Other studies give numbers that are up to 20 percentage points smaller
# or larger:
# "In an early study from the United States (US), MCPyV seroprevalence in
12 changes: 11 additions & 1 deletion populations.py
@@ -1,3 +1,5 @@
import datetime
import dataclasses
from typing import Optional

from pathogen_properties import Population, prevalence_data_filename
@@ -29,9 +31,17 @@ def load_location_populations():
def us_population(
year: int, county: Optional[str] = None, state: Optional[str] = None
) -> Population:

if year in [2023, 2024]:
pop_2022 = us_population(2022, county, state)
return dataclasses.replace(
pop_2022,
parsed_start=datetime.date(year, 7, 1),
parsed_end=datetime.date(year, 7, 1))

if year not in [2020, 2021, 2022]:
raise Exception("Unsupported year: %s" % year)

total_people = 0
# All estimates are July 1st, specifically.
pop_date = "%s-07-01" % year
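Aside from the diff: the new 2023/2024 branch in populations.py leans on dataclasses.replace to reuse the 2022 figures with an updated date. Below is a self-contained sketch of that pattern, using a hypothetical Pop stand-in rather than the project's actual Population class:

import dataclasses
import datetime


@dataclasses.dataclass(frozen=True)
class Pop:  # hypothetical stand-in for the project's Population dataclass
    people: int
    parsed_start: datetime.date
    parsed_end: datetime.date


pop_2022 = Pop(
    people=1_000_000,
    parsed_start=datetime.date(2022, 7, 1),
    parsed_end=datetime.date(2022, 7, 1),
)
# Reuse the 2022 counts, stamping them with the requested year's July 1
# date, as the 2023/2024 branch of us_population does.
pop_2024 = dataclasses.replace(
    pop_2022,
    parsed_start=datetime.date(2024, 7, 1),
    parsed_end=datetime.date(2024, 7, 1),
)
assert pop_2024.people == pop_2022.people
assert pop_2024.parsed_start == datetime.date(2024, 7, 1)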