Johnson Flu A: hack together a rough P2RA estimate #240

Open
wants to merge 3 commits into base: main
6 changes: 3 additions & 3 deletions figures/counts-by-pathogen-and-study.py
@@ -86,9 +86,9 @@ def start():
matching_reads = mgs_data.viral_reads(bioproject, taxids)
viral_samples = mgs_data.sample_attributes(
bioproject,
enrichment=None
if study == "brinch"
else mgs.Enrichment.VIRAL,
enrichment=(
None if study == "brinch" else mgs.Enrichment.VIRAL
),
)
n_samples += len(viral_samples)
counts = [
2 changes: 1 addition & 1 deletion fit.py
@@ -37,7 +37,7 @@ def start(num_samples: int, plot: bool) -> None:
) in predictors_by_taxid():
taxids_str = "_".join(str(t) for t in taxids)
for study, bioprojects in target_bioprojects.items():
enrichment = None if study == "brinch" else Enrichment.VIRAL
enrichment = None
model = stats.build_model(
mgs_data,
bioprojects,
448 changes: 4 additions & 444 deletions fits_summary.tsv

Large diffs are not rendered by default.

6,114 changes: 28 additions & 6,086 deletions input.tsv

Large diffs are not rendered by default.

69 changes: 37 additions & 32 deletions mgs.py
@@ -1,3 +1,4 @@
import os
import json
import urllib.request
from collections import Counter
@@ -14,7 +15,7 @@

MGS_REPO_DEFAULTS = {
"user": "naobservatory",
"repo": "mgs-pipeline",
"repo": "mgs-restricted",
"ref": "data-2023-07-21",
}

@@ -23,10 +24,8 @@


target_bioprojects = {
"crits_christoph": [BioProject("PRJNA661613")],
"rothman": [BioProject("PRJNA729801")],
"spurbeck": [BioProject("PRJNA924011")],
"brinch": [BioProject("PRJEB13832"), BioProject("PRJEB34633")],
"johnson": [BioProject("MJ-2024-02-12"),
BioProject("MJ-2024-03-04")],
}


@@ -37,18 +36,9 @@ class GitHubRepo:
ref: str

def get_file(self, path: str) -> str:
file_url = (
f"https://raw.githubusercontent.com/"
f"{self.user}/{self.repo}/{self.ref}/{path}"
)
with urllib.request.urlopen(file_url) as response:
if response.status == 200:
return response.read()
else:
raise ValueError(
f"Failed to download {file_url}. "
f"Response status code: {response.status}"
)
with open(os.path.expanduser(
f"~/code/{self.repo}/{path}")) as inf:
return inf.read()


def load_bioprojects(repo: GitHubRepo) -> dict[BioProject, list[Sample]]:
@@ -68,20 +58,33 @@ class SampleAttributes(BaseModel):
country: str
state: Optional[str] = None
county: Optional[str] = None
location: str
location: Optional[str] = None
fine_location: Optional[str] = None
# Fixme: Not all the dates are real dates
date: date | str
reads: int
edta_treated: Optional[bool|str] = False
readlengths: Optional[dict] = None
ribofrac: Optional[float] = None
nuclease_treated: Optional[bool|str] = False
sample_volume_ml: Optional[str] = None
enrichment: Optional[Enrichment] = None
concentration_method: Optional[str] = None
method: Optional[str] = None
airport: Optional[str] = None
collection: Optional[str] = None



def load_sample_attributes(repo: GitHubRepo) -> dict[Sample, SampleAttributes]:
data = json.loads(repo.get_file("dashboard/metadata_samples.json"))
return {
Sample(s): SampleAttributes(**attribs) for s, attribs in data.items()
}
sa = {Sample(s): SampleAttributes(**attribs)
for s, attribs in data.items()}
for a in sa.values():
a.location = "n/a"
a.fine_location = "n/a"

return sa


SampleCounts = dict[TaxID, dict[Sample, int]]
@@ -106,9 +109,11 @@ def make_count_tree(
taxtree: Tree[TaxID], sample_counts: SampleCounts
) -> Tree[tuple[TaxID, Counter[Sample]]]:
return taxtree.map(
lambda taxid: (taxid, Counter(sample_counts[taxid]))
if taxid in sample_counts
else (taxid, Counter()),
lambda taxid: (
(taxid, Counter(sample_counts[taxid]))
if taxid in sample_counts
else (taxid, Counter())
),
)


@@ -151,14 +156,14 @@ def sample_attributes(
samples = {
s: self.sample_attrs[s] for s in self.bioprojects[bioproject]
}
if enrichment:
return {
s: attrs
for s, attrs in samples.items()
if attrs.enrichment == enrichment
}
else:
return samples
return {
s: attrs
for s, attrs in samples.items()
# Remove EDTA-treated samples from the second run, where edta_treated
# is True/False, but not the first run, where it was Yes/No; in the
# second run the treatment was much too aggressive.
if not attrs.edta_treated == True
}

def total_reads(self, bioproject: BioProject) -> dict[Sample, int]:
return {
3 changes: 1 addition & 2 deletions pathogen_properties.py
@@ -301,8 +301,7 @@ def average(in1: "Scalar", in2: "Scalar"):

class Predictor(abc.ABC, Variable):
@abc.abstractmethod
def get_data(self) -> float:
...
def get_data(self) -> float: ...


@dataclass(kw_only=True, eq=True, frozen=True)
2 changes: 1 addition & 1 deletion pathogens/aav2.py
@@ -29,7 +29,7 @@
# (93K%2C%20docx)-,Supplemental%20data%3A,-Click%20here%20to
date="2022",
active=Active.LATENT,
source="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9063149/#:~:text=Seropositivity%20for%20(A)%20the%20global%20population"
source="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9063149/#:~:text=Seropositivity%20for%20(A)%20the%20global%20population",
# This number matches AAV-2 seroprevalence in a study of 101 males with
# Duchenne Muscular Dystrophy, showing a seroprevalence of 56%:
# "https://pubmed.ncbi.nlm.nih.gov/36324212/#:~:text=We%20prospectively%20enrolled,and%20AAV8%20(47%25)."
2 changes: 1 addition & 1 deletion pathogens/bkv.py
@@ -61,7 +61,7 @@
date="2007",
# Study is from 2009, sera are from 2007
active=Active.LATENT,
source="https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=We%20found%20the%20seroprevalence%20(%2B/%E2%88%92%201%25)%20in%20healthy%20adult%20blood%20donors%20(1501)%20was%20SV40%20(9%25)%2C%20BKV%20(82%25)%2C%20JCV%20(39%25)%2C%20LPV"
source="https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=We%20found%20the%20seroprevalence%20(%2B/%E2%88%92%201%25)%20in%20healthy%20adult%20blood%20donors%20(1501)%20was%20SV40%20(9%25)%2C%20BKV%20(82%25)%2C%20JCV%20(39%25)%2C%20LPV",
# A tabular breakdown can be found here: "https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=original%20image-,Table%201.,-Seroprevalence%20of%20polyomaviruses"
)

2 changes: 1 addition & 1 deletion pathogens/hcv.py
@@ -25,7 +25,7 @@
date="2020",
country="United States",
state="Ohio",
source="https://odh.ohio.gov/wps/wcm/connect/gov/ec0dec22-1eea-4d17-a86a-ac4bc35be4d3/HCV+5+Year+Report+2021.pdf?MOD=AJPERES&CONVERT_TO=url&CACHEID=ROOTWORKSPACE.Z18_M1HGGIK0N0JO00QO9DDDDM3000-ec0dec22-1eea-4d17-a86a-ac4bc35be4d3-oqU9kQ8"
source="https://odh.ohio.gov/wps/wcm/connect/gov/ec0dec22-1eea-4d17-a86a-ac4bc35be4d3/HCV+5+Year+Report+2021.pdf?MOD=AJPERES&CONVERT_TO=url&CACHEID=ROOTWORKSPACE.Z18_M1HGGIK0N0JO00QO9DDDDM3000-ec0dec22-1eea-4d17-a86a-ac4bc35be4d3-oqU9kQ8",
# Page 1 of 8
)

2 changes: 1 addition & 1 deletion pathogens/hiv.py
@@ -89,7 +89,7 @@
people=652_221,
country="Denmark",
date="2022",
source="https://kk.statistikbank.dk/statbank5a/SelectVarVal/Define.asp?MainTable=KKBEF1"
source="https://kk.statistikbank.dk/statbank5a/SelectVarVal/Define.asp?MainTable=KKBEF1",
# Search variables: Copenhagen in total, Gender in general, Age in total,
# 2022 Q4
)
23 changes: 22 additions & 1 deletion pathogens/influenza.py
@@ -140,6 +140,19 @@ def load_weekly_data() -> WeeklyData:
source="https://www.cdc.gov/flu/about/burden/2021-2022.htm#:~:text=The%20overall%20burden%20of%20influenza%20(flu)%20for%20the%202021%2D2022%20season%20was%20an%20estimated%C2%A09%20million%C2%A0flu%20illnesses",
)

infections_2023_2024 = IncidenceAbsolute(
annual_infections=41_500_000,
confidence_interval=(29_000_000, 54_000_000),
Collaborator:
This is a technical quibble, but I can't tell from glancing at the website whether their estimated range is technically a confidence interval.

Member Author:
You're right, it may not be.

I've used the midpoint of the two numbers from the website; the range isn't currently used for anything.
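For reference, the arithmetic behind the 41,500,000 figure is just the midpoint of the CDC's preliminary range; a minimal illustrative sketch, not part of the diff:

# Illustrative only: midpoint of the CDC's preliminary 29-54 million range.
low, high = 29_000_000, 54_000_000
annual_infections = (low + high) // 2
assert annual_infections == 41_500_000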

# The estimate covers only part of the year, but the code that works with these
# understands start and end dates.
start_date="2023-10-01",
end_date="2024-03-09",
tag="us-2023-2024",
country="United States",
# "CDC estimates that, from October 1, 2023 through March 9, 2024, there
# have been: 29 – 54 million flu illnesses"
source="https://www.cdc.gov/flu/about/burden/preliminary-in-season-estimates.htm"
)

def compare_incidence_to_positive_tests(
official_incidence: IncidenceAbsolute, weekly_data: WeeklyData
@@ -190,6 +203,9 @@ def estimate_incidences() -> list[IncidenceRate]:
underreporting_2021_2022 = compare_incidence_to_positive_tests(
infections_2021_2022, weekly_data
)
underreporting_2023_2024 = compare_incidence_to_positive_tests(
infections_2023_2024, weekly_data
)

# The CDC didn't estimate annual infections for 2020-2021, but assume
# underreporting is the average of the two years we do have.
@@ -220,11 +236,16 @@ def get_underreporting(
and start <= infections_2021_2022.parsed_start
):
return underreporting_2020_2021
elif (
start >= infections_2023_2024.parsed_start
and start <= infections_2023_2024.parsed_end
):
return underreporting_2023_2024
else:
return None

incidences = []
for state in ["California", "Ohio"]:
for state in ["Missouri"]:
for parsed_start in weekly_data[state]:
positive_a, positive_b = weekly_data[state][parsed_start]
parsed_end = parsed_start + datetime.timedelta(weeks=1)
2 changes: 1 addition & 1 deletion pathogens/jcv.py
@@ -57,7 +57,7 @@
date="2007",
# Study is from 2009, sera are from 2007
active=Active.LATENT,
source="https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=We%20found%20the%20seroprevalence%20(%2B/%E2%88%92%201%25)%20in%20healthy%20adult%20blood%20donors%20(1501)%20was%20SV40%20(9%25)%2C%20BKV%20(82%25)%2C%20JCV%20(39%25)%2C%20LPV"
source="https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=We%20found%20the%20seroprevalence%20(%2B/%E2%88%92%201%25)%20in%20healthy%20adult%20blood%20donors%20(1501)%20was%20SV40%20(9%25)%2C%20BKV%20(82%25)%2C%20JCV%20(39%25)%2C%20LPV",
# A tabular breakdown can be found here: "https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1000363#:~:text=original%20image-,Table%201.,-Seroprevalence%20of%20polyomaviruses"
)

2 changes: 1 addition & 1 deletion pathogens/mcv.py
@@ -38,7 +38,7 @@
country="United States",
date="2009",
active=Active.LATENT,
source="https://academic.oup.com/jnci/article/101/21/1510/963538#:~:text=A%20cut%20point%20of%2015%E2%80%89000%20median%20fluorescent%20intensity%20units%20was%20chosen%20as%20described%20above%20and%20used%20to%20identify%20268%20(59.4%25)%20of%20the%20451%20control%20subjects%20as%20seropositive%20for%20MCPyV%20VP1%3B%20this%20percentage%20was%20similar%20to%20that%20among%20women%20in%20control%20group%201%20(ie%2C%2053%25)."
source="https://academic.oup.com/jnci/article/101/21/1510/963538#:~:text=A%20cut%20point%20of%2015%E2%80%89000%20median%20fluorescent%20intensity%20units%20was%20chosen%20as%20described%20above%20and%20used%20to%20identify%20268%20(59.4%25)%20of%20the%20451%20control%20subjects%20as%20seropositive%20for%20MCPyV%20VP1%3B%20this%20percentage%20was%20similar%20to%20that%20among%20women%20in%20control%20group%201%20(ie%2C%2053%25).",
# Other studies give numbers that are up to 20 percentage points smaller
# or larger:
# "In an early study from the United States (US), MCPyV seroprevalence in
12 changes: 11 additions & 1 deletion populations.py
@@ -1,3 +1,5 @@
import datetime
import dataclasses
from typing import Optional

from pathogen_properties import Population, prevalence_data_filename
@@ -29,9 +31,17 @@ def load_location_populations():
def us_population(
year: int, county: Optional[str] = None, state: Optional[str] = None
) -> Population:

if year in [2023, 2024]:
pop_2022 = us_population(2022, county, state)
return dataclasses.replace(
pop_2022,
parsed_start=datetime.date(year, 7, 1),
parsed_end=datetime.date(year, 7, 1))

if year not in [2020, 2021, 2022]:
raise Exception("Unsupported year: %s" % year)

total_people = 0
# All estimates are July 1st, specifically.
pop_date = "%s-07-01" % year
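Aside from the diff: the new 2023/2024 branch in populations.py leans on dataclasses.replace to reuse the 2022 figures with an updated date. Below is a self-contained sketch of that pattern, using a hypothetical Pop stand-in rather than the project's actual Population class:

import dataclasses
import datetime


@dataclasses.dataclass(frozen=True)
class Pop:  # hypothetical stand-in for the project's Population dataclass
    people: int
    parsed_start: datetime.date
    parsed_end: datetime.date


pop_2022 = Pop(
    people=1_000_000,
    parsed_start=datetime.date(2022, 7, 1),
    parsed_end=datetime.date(2022, 7, 1),
)
# Reuse the 2022 counts, stamping them with the requested year's July 1
# date, as the 2023/2024 branch of us_population does.
pop_2024 = dataclasses.replace(
    pop_2022,
    parsed_start=datetime.date(2024, 7, 1),
    parsed_end=datetime.date(2024, 7, 1),
)
assert pop_2024.people == pop_2022.people
assert pop_2024.parsed_start == datetime.date(2024, 7, 1)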