From 79f4cd7ce8d592e9ca8eb09f2c4c859f073f8ad6 Mon Sep 17 00:00:00 2001 From: simonleandergrimm Date: Mon, 4 Dec 2023 10:23:18 -0500 Subject: [PATCH] removed scripts that aren't needed for the preprint p2ra repo. --- authors.txt | 16 ------- determine_pseudocounts.py | 45 ------------------- generate_numbers_for_discussion.py | 69 ------------------------------ get_rothman_virus_counts.py | 29 ------------- list_taxids.py | 26 ----------- process-authors.py | 39 ----------------- 6 files changed, 224 deletions(-) delete mode 100644 authors.txt delete mode 100755 determine_pseudocounts.py delete mode 100755 generate_numbers_for_discussion.py delete mode 100755 get_rothman_virus_counts.py delete mode 100755 list_taxids.py delete mode 100755 process-authors.py diff --git a/authors.txt b/authors.txt deleted file mode 100644 index f61ce88..0000000 --- a/authors.txt +++ /dev/null @@ -1,16 +0,0 @@ -Bengtsson-Palme: Johan Bengtsson-Palme, Rickard Hammarén, Chandan Pal, Marcus Östman, Berndt Björlenius, Carl-Fredrik Flach, Jerker Fick, Erik Kristiansson, Mats Tysklind, D G Joakim Larsson -Brinch: Christian Brinch, Pimlapas Leekitcharoenphon, Ana S. R. Duarte, Christina A. Svendsen, Jacob D. Jensen, and Frank M. Aarestrup -Brumfield: Kyle D. Brumfield, Menu Leddy, Moiz Usmani, Joseph A. Cotruvo, Ching-Tzone Tien, Suzanne Dorsey, Karlis Graubics, Brian Fanelli, Isaac Zhou, Nathaniel Registe, Manoj Dadlani, Malinda Wimalarante, Dilini Jinasena, Rushan Abayagunawardena, Chiran Withanachchi, Anwar Huq, Antarpreet Jutla, Rita R. Colwell -Crits-Christoph: Alexander Crits-Christoph, Rose S. Kantor, Matthew R. Olm, Oscar N. Whitney, Basem Al-Shayeb, Yue Clare Lou, Avi Flamholz, Lauren C. Kennedy, Hannah Greenwald, Adrian Hinkle, Jonathan Hetzel, Sara Spitzer, Jeffery Koble, Asako Tan, Fred Hyde, Gary Schroth, Scott Kuersten, Jillian F. Banfield, and Kara L. Nelson -Cui: Han Cui, Jing Wang, Xiaoyu Cai, Kun Feng, Guo-Jun Xie, Bing-Feng Liu, and Defeng Xing -Fierer: Noah Fierer, Hannah Holland-Moritz, Alexandra Alexiev, Harpreet Batther, Nicholas B. Dragone, Liam Friar, Matthew J. Gebert, Sarah Gering, Jessica B. Henley, Sierra Jech, Emily M. Kibby, Tina Melie, William B. Patterson, Eric Peterson, Kyle Schutz, Elías Stallard-Olivera, John Sterrett, Corinne Walsh, Cresten Mansfeldt -Hendriksen: Rene S. Hendriksen, Oksana Lukjancenko, Patrick Munk, Mathis H. Hjelmsø, Jennifer R. Verani, Eric Ng’eno, Godfrey Bigogo, Samuel Kiplangat, Traoré Oumar, Lasse Bergmark, Timo Röder, John C. Neatherlin, Onyango Clayton, Tine Hald, Susanne Karlsmose, Sünje J. Pamp, Barry Fields, Joel M. Montgomery, Frank M. Aarestrup -Maritz: Julia M. Maritz, Theresa A. Ten Eyck, S. Elizabeth Alter, and Jane M. Carlton -McCall: Camille McCall, Ryan A. Leo Elworth, Kristine M. Wylie, Todd N. Wylie, Katherine Dyson, Ryan Doughty, Todd J. Treangen, Loren Hopkins, Katherine Ensor, Lauren B. Stadler -Munk: Patrick Munk, Christian Brinch, Frederik Duus Møller, Thomas N. Petersen, Rene S. Hendriksen, Anne Mette Seyfarth, Jette S. Kjeldgaard, Christina Aaby Svendsen, Bram van Bunnik, Fanny Berglund, the Global Sewage Surveillance Consortium, D. G. Joakim Larsson, Marion Koopmans, Mark Woolhouse, and Frank M. Aarestrup -Ng: Charmaine Ng, Boonfei Tan, Xiao-Tao Jiang, Xiaoqiong Gu, Hongjie Chen, Bradley William Schmitz, Laurence Haller, Francis Rathinam Charles, Tong Zhang, Karina Gin -Petersen: Thomas Nordahl Petersen, Simon Rasmussen, Henrik Hasman, Christian Carøe, Jacob Bælum, Anna Charlotte Schultz, Lasse Bergmark, Christina A. Svendsen, Ole Lund, Thomas Sicheritz-Pontén, and Frank M. Aarestrup -Rothman: Jason A Rothman, Theresa B Loveless, Joseph Kapcia, Eric D Adams, Joshua A Steele, Amity G Zimmer-Faust, Kylie Langlois, David Wanless, Madison Griffith, Lucy Mao, Jeffrey Chokry, John F Griffith, Katrine L Whiteson -Spurbeck: Rachel R. Spurbeck, Lindsay A. Catlin, Chiranjit Mukherjee, Anthony K. Smith, Angela Minard-Smith -Wang: Changzhi Wang, David Mantilla-Calderon, Yanghui Xiong, Mohsen Alkahtani, Yasir M. Bashawri, Hamed Al Qarni, and Pei-Ying Hong -Yang: Qian Yang, Pierre Rivailler, Shuangli Zhu, Dongmei Yan, Na Xie, Haishu Tang, Yong Zhang, Wenbo Xu diff --git a/determine_pseudocounts.py b/determine_pseudocounts.py deleted file mode 100755 index 1e0f183..0000000 --- a/determine_pseudocounts.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 - -import mgs -import pathogens - -mgs_data = mgs.MGSData.from_repo() -import stats - -for ( - pathogen_name, - tidy_name, - predictor_type, - taxids, - predictors, -) in pathogens.predictors_by_taxid(): - if not any(predictor.is_pseudocount for predictor in predictors): - continue - - n_samples = 0 - n_pseudocount_samples = 0 - - for study, bioprojects in mgs.target_bioprojects.items(): - for bioproject in bioprojects: - enrichment = None if study == "brinch" else mgs.Enrichment.VIRAL - chosen_predictors = { - sample: stats.lookup_variables(attrs, predictors) - for sample, attrs in mgs_data.sample_attributes( - bioproject, enrichment=enrichment - ).items() - } - if all(ps == [] for ps in chosen_predictors.values()): - continue - for sample, preds in chosen_predictors.items(): - (predictor,) = preds - n_samples += 1 - if predictor.is_pseudocount: - n_pseudocount_samples += 1 - - print( - tidy_name, - study, - n_pseudocount_samples, - n_samples, - "%.0f%%" % (100 * n_pseudocount_samples / n_samples), - ) diff --git a/generate_numbers_for_discussion.py b/generate_numbers_for_discussion.py deleted file mode 100755 index 5b07e9d..0000000 --- a/generate_numbers_for_discussion.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python3 - -AGNOSTIC_FOLLOWUP = 1000 # weekly reads matching pathogen -TARGETED_FOLLOWUP = 3 # weekly reads matching pathogen - -SEQUENCING_COST = 8000 / 1e9 # dollars per read - -cols = None -with open("fits_summary.tsv") as inf: - for line in inf: - row = line.strip().split("\t") - if not cols: - cols = row - continue - - if row[cols.index("location")] != "Overall": - continue - - pathogen = row[cols.index("pathogen")] - study = row[cols.index("study")] - median = float(row[cols.index("50%")]) - predictor_type = row[cols.index("predictor_type")] - is_prevalence = { - "prevalence": True, - "incidence": False, - }[predictor_type] - - # 1% prevalence or 0.5% weekly incidence - adjusted_relative_abundance = median if is_prevalence else median / 2 - - print(pathogen, study) - print(" RA1%% %.0e" % median) - print( - " Relative abundance of 1 in %.0e" - % (1 / adjusted_relative_abundance) - ) - print( - " Weekly reads to flag for manual followup: %.0e" - % (AGNOSTIC_FOLLOWUP * 1 / adjusted_relative_abundance) - ) - print( - " Weekly agnostic cost: $%.0f" - % ( - AGNOSTIC_FOLLOWUP - * 1 - / adjusted_relative_abundance - * SEQUENCING_COST - ) - ) - print( - " Annual agnostic cost: $%.0f" - % ( - AGNOSTIC_FOLLOWUP - * 1 - / adjusted_relative_abundance - * SEQUENCING_COST - * 52 - ) - ) - print( - " Annual targeted cost: $%.0f" - % ( - TARGETED_FOLLOWUP - * 1 - / adjusted_relative_abundance - * SEQUENCING_COST - * 52 - ) - ) diff --git a/get_rothman_virus_counts.py b/get_rothman_virus_counts.py deleted file mode 100755 index 4c7584c..0000000 --- a/get_rothman_virus_counts.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 - -import mgs -from pathogens import pathogens - -if __name__ == "__main__": - bioproject = mgs.BioProject("PRJNA729801") # Rothman - mgs_data = mgs.MGSData.from_repo() - samples = mgs_data.sample_attributes( - bioproject, enrichment=mgs.Enrichment.VIRAL - ) - - fine_locs = set(attribs.fine_location for _, attribs in samples.items()) - - for pathogen_name, pathogen in pathogens.items(): - print(pathogen_name) - taxids = pathogen.pathogen_chars.taxids - virus_reads = mgs_data.viral_reads(bioproject, taxids) - print(" All", sum(virus_reads[s] for s in samples), sep="\t") - for fine_loc in fine_locs: - print( - f" {fine_loc}", - sum( - virus_reads[s] - for s, attribs in samples.items() - if attribs.fine_location == fine_loc - ), - sep="\t", - ) diff --git a/list_taxids.py b/list_taxids.py deleted file mode 100755 index c5f23f8..0000000 --- a/list_taxids.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python3 -import pathogen_properties -import pathogens - - -def start(): - taxid_to_name = {} - for pathogen_name, pathogen in pathogens.pathogens.items(): - for taxids in pathogen_properties.by_taxids( - pathogen.pathogen_chars, pathogen.estimate_prevalences() - ): - for taxid in taxids: - taxid_to_name[taxid] = pathogen_name - for taxids in pathogen_properties.by_taxids( - pathogen.pathogen_chars, pathogen.estimate_incidences() - ): - for taxid in taxids: - taxid_to_name[taxid] = pathogen_name - - print("taxid", "filename", "human_readable", sep="\t") - for taxid, name in sorted(taxid_to_name.items()): - print(taxid, name, pathogens.tidy_name(name, [taxid]), sep="\t") - - -if __name__ == "__main__": - start() diff --git a/process-authors.py b/process-authors.py deleted file mode 100755 index 000bb3f..0000000 --- a/process-authors.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python3 - -import re -from collections import defaultdict - -by_last_name = defaultdict(set) -all_authors = set() -# Manually copied from papers -with open("authors.txt") as inf: - for line in inf: - line = line.removesuffix("\n") - paper, authors = line.split(": ") - - for author in authors.split(","): - author = author.strip() - author = author.removeprefix("and ") - - for f, r in [ - (".", ""), - ("Thomas Nordahl Petersen", "Thomas N Petersen"), - ("Christina Aaby Svendsen", "Christina A Svendsen"), - ]: - author = author.replace(f, r) - - # remove middle names - while re.match(".* [A-Z] .*", author): - author = re.sub("(.+) [A-Z] (.+)", r"\1 \2", author) - - by_last_name[author.split(" ")[-1]].add(author) - all_authors.add(author) - -for last_name in sorted(by_last_name): - if len(by_last_name[last_name]) == 1: - continue - print(last_name) - for author in sorted(by_last_name[last_name]): - print(" ", author) - -print(", ".join(sorted(all_authors)))