From 3e49417d2c5e73a5a9c4f69ec54136d4f2bbf7bc Mon Sep 17 00:00:00 2001
From: simonleandergrimm <simonleandergrimm@gmail.com>
Date: Tue, 9 Jul 2024 13:51:21 -0400
Subject: [PATCH] Swap the new and old labels in the clade count comparison script

---
 ...2024-07-09-print-clade-count-comparison.py | 28 +++++--------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/scripts/2024-07-09-print-clade-count-comparison.py b/scripts/2024-07-09-print-clade-count-comparison.py
index a0e6169..0dc6107 100644
--- a/scripts/2024-07-09-print-clade-count-comparison.py
+++ b/scripts/2024-07-09-print-clade-count-comparison.py
@@ -1,8 +1,5 @@
 import os
 import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sns
 from collections import defaultdict
 
 BIOPROJECT_DIR = "bioprojects"
@@ -38,15 +35,6 @@
 ]
 
 
-def read_clade_counts(file_path):
-    df = pd.read_csv(file_path, sep="\t")
-    sub_df = df[(df["taxid"] == 10239)]["n_reads_clade", "sample"]
-    dict = {}
-    for reads, sample in sub_df.itertuples():
-        dict[sample] = reads
-    return dict
-
-
 def collect_data():
     data = defaultdict(lambda: defaultdict(dict))
     for study, bioprojects in TARGET_STUDY_METADATA.items():
@@ -68,8 +56,7 @@ def collect_data():
                         n_reads_direct,
                         n_reads_clade,
                     ) = line.strip().split("\t")
-                    # if taxid == "10239":  # Check if taxid is 10239 (Viruses)
-                    data[sample][name]["old"] = int(n_reads_clade)
+                    data[sample][name]["new"] = int(n_reads_clade)
             with open(old_file, "r") as f:
                 next(f)
                 for line in f:
@@ -82,8 +69,7 @@ def collect_data():
                         n_reads_direct,
                         n_reads_clade,
                     ) = line.strip().split("\t")
-                    # if taxid == "10239":  # Check if taxid is 10239 (Viruses)
-                    data[sample][name]["new"] = int(n_reads_clade)
+                    data[sample][name]["old"] = int(n_reads_clade)
 
     differences = defaultdict(lambda: defaultdict(list))
     for sample in data.keys():
@@ -91,10 +77,10 @@ def collect_data():
             new_count = data[sample][name].get("new", 0)
             old_count = data[sample][name].get("old", 0)
             differences[sample][name] = int(new_count - old_count)
-    return differences
+    return data, differences
 
 
-differences = collect_data()
+data, differences = collect_data()
 
 df = pd.DataFrame(differences).transpose()
 
@@ -105,9 +91,9 @@ def collect_data():
     row = df.loc[sample]
     sorted_row = row.abs().sort_values(ascending=False)
 
-    top_5 = sorted_row.head(15)
+    top_15 = sorted_row.head(15)
 
     print(f"\nSample: {sample}")
-    for name in top_5.index:
+    for name in top_15.index:
         value = row[name]
-        print(f"{name}: {value}")
+        print(f"{name}: {value} ({data[sample][name].get('new', 0)})")