Update error message and docs for features argument to clarify Cell…

…Profiler default expectations and how to handle non-CellProfiler data (#448) * updated error message * updated error messages in tests * update metadata_features docstrings * update docstrings for feature argument * update features docstring and remove unused indent * clarify error message to avoid confusing recommendation to update features parameter since it does not exist in the function * update error message in tests * Update pycytominer/consensus.py Co-authored-by: Dave Bunten <[email protected]> * Update pycytominer/cyto_utils/features.py Co-authored-by: Dave Bunten <[email protected]> * Update pycytominer/cyto_utils/features.py Co-authored-by: Dave Bunten <[email protected]> * Update pycytominer/normalize.py Co-authored-by: Dave Bunten <[email protected]> * Update pycytominer/cyto_utils/write_gct.py Co-authored-by: Dave Bunten <[email protected]> * made function docs multi-lined * updated test with error messages --------- Co-authored-by: Gregory Way <[email protected]> Co-authored-by: Dave Bunten <[email protected]>
cytomining · Sep 27, 2024 · 2fd5ca3 · 2fd5ca3
1 parent c6ad2a0
commit 2fd5ca3
Show file tree

Hide file tree

Showing 14 changed files with 50 additions and 37 deletions.
diff --git a/pycytominer/consensus.py b/pycytominer/consensus.py
@@ -35,7 +35,7 @@ def consensus(
     features : list
         A list of strings corresponding to feature measurement column names in the
         `profiles` DataFrame. All features listed must be found in `profiles`.
-        Defaults to "infer". If "infer", then assume cell painting features are those
+        Defaults to "infer". If "infer", then assume features are from CellProfiler output and
         prefixed with "Cells", "Nuclei", or "Cytoplasm".
     output_file : str, optional
         If provided, will write consensus profiles to file. If not specified, will

diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py
@@ -80,7 +80,7 @@ def infer_cp_features(
     metadata=False,
     image_features=False,
 ):
-    """Given a dataframe, output features that we expect to be Cell Painting features.
+    """Given CellProfiler output data read as a DataFrame, output feature column names as a list.
 
     Parameters
     ----------
@@ -90,6 +90,8 @@ def infer_cp_features(
         Compartments from which Cell Painting features were extracted.
     metadata : bool, default False
         Whether or not to infer metadata features.
+        If metadata is set to True, find column names that begin with the `Metadata_` prefix.
+        This convention is expected by CellProfiler defaults.
     image_features : bool, default False
         Whether or not the profiles contain image features.
 
@@ -115,9 +117,12 @@ def infer_cp_features(
             population_df.columns.str.startswith("Metadata_")
         ].tolist()
 
-    assert (  # noqa: S101
-        len(features) > 0
-    ), "No CP features found. Are you sure this dataframe is from CellProfiler?"
+    if len(features) == 0:
+        raise ValueError(
+            "No features or metadata found. Pycytominer expects CellProfiler column names by default. "
+            "If you're using non-CellProfiler data, please do not 'infer' features. "
+            "Instead, check if the function has a `features` or `meta_features` parameter, and input column names manually."
+        )
 
     return features
 
@@ -150,7 +155,9 @@ def drop_outlier_features(
     population_df : pandas.core.frame.DataFrame
         DataFrame that includes metadata and observation features.
     features : list of str or str, default "infer"
-        Features present in the population dataframe. If "infer", then assume Cell Painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_"
+        Features present in the population dataframe. If "infer",
+        then assume CellProfiler feature conventions
+        (start with "Cells_", "Nuclei_", or "Cytoplasm_")
     samples : str, default "all"
         List of samples to perform operation on. The function uses a pd.DataFrame.query()
         function, so you should  structure samples in this fashion. An example is

diff --git a/pycytominer/cyto_utils/modz.py b/pycytominer/cyto_utils/modz.py
@@ -98,9 +98,10 @@ def modz(
         a string or list of column(s) in the population dataframe that
         indicate replicate level information
     features : list, default "infer"
-         List of features present in the population dataframe [default: "infer"]
-         if "infer", then assume cell painting features are those that start with
-         "Cells_", "Nuclei_", or "Cytoplasm_".
+        A list of strings corresponding to feature measurement column names in the
+        `population_df` DataFrame. All features listed must be found in `population_df`.
+        Defaults to "infer". If "infer", then assume CellProfiler features are those
+        prefixed with "Cells", "Nuclei", or "Cytoplasm".
     method : str, default "spearman"
         indicating which correlation metric to use.
     min_weight : float, default 0.01

diff --git a/pycytominer/cyto_utils/write_gct.py b/pycytominer/cyto_utils/write_gct.py
@@ -32,7 +32,7 @@ def write_gct(
     features : list
         A list of strings corresponding to feature measurement column names in the
         `profiles` DataFrame. All features listed must be found in `profiles`.
-        Defaults to "infer". If "infer", then assume cell painting features are those
+        Defaults to "infer". If "infer", then assume features are from CellProfiler output and
         prefixed with "Cells", "Nuclei", or "Cytoplasm".
     meta_features : list
         A list of strings corresponding to metadata column names in the `profiles`

diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py
@@ -43,10 +43,10 @@ def feature_select(
     ----------
     profiles : pandas.core.frame.DataFrame or file
         DataFrame or file of profiles.
-    features : list
+    features : list, default "infer"
         A list of strings corresponding to feature measurement column names in the
         `profiles` DataFrame. All features listed must be found in `profiles`.
-        Defaults to "infer". If "infer", then assume cell painting features are those
+        Defaults to "infer". If "infer", then assume CellProfiler features are those
         prefixed with "Cells", "Nuclei", or "Cytoplasm".
     image_features: bool, default False
         Whether the profiles contain image features.

diff --git a/pycytominer/normalize.py b/pycytominer/normalize.py
@@ -34,14 +34,15 @@ def normalize(
     features : list
         A list of strings corresponding to feature measurement column names in the
         `profiles` DataFrame. All features listed must be found in `profiles`.
-        Defaults to "infer". If "infer", then assume cell painting features are those
+        Defaults to "infer". If "infer", then assume features are from CellProfiler output and
         prefixed with "Cells", "Nuclei", or "Cytoplasm".
     image_features: bool, default False
         Whether the profiles contain image features.
     meta_features : list
         A list of strings corresponding to metadata column names in the `profiles`
         DataFrame. All features listed must be found in `profiles`. Defaults to "infer".
-        If "infer", then assume metadata features are those prefixed with "Metadata"
+        If "infer", then assume CellProfiler metadata features, identified by
+        column names that begin with the `Metadata_` prefix.
     samples : str
         The metadata column values to use as a normalization reference. We often use
         control samples. The function uses a pd.query() function, so you should
@@ -114,7 +115,7 @@ def normalize(
     normalized_df = normalize(
         profiles=data_df,
         features=["x", "y", "z", "zz"],
-        meta_features="infer",
+        meta_features=["Metadata_plate", "Metadata_treatment"],
         samples="Metadata_treatment == 'control'",
         method="standardize"
     )

diff --git a/pycytominer/operations/correlation_threshold.py b/pycytominer/operations/correlation_threshold.py
@@ -20,9 +20,10 @@ def correlation_threshold(
     population_df : pandas.core.frame.DataFrame
         DataFrame that includes metadata and observation features.
     features : list, default "infer"
-         List of features present in the population dataframe [default: "infer"]
-         if "infer", then assume cell painting features are those that start with
-         "Cells_", "Nuclei_", or "Cytoplasm_".
+        A list of strings corresponding to feature measurement column names in the
+        `population_df` DataFrame. All features listed must be found in `population_df`.
+        Defaults to "infer". If "infer", then assume CellProfiler features are those
+        prefixed with "Cells", "Nuclei", or "Cytoplasm".
     samples : str, default "all"
         List of samples to perform operation on. The function uses a pd.DataFrame.query()
         function, so you should  structure samples in this fashion. An example is

diff --git a/pycytominer/operations/get_na_columns.py b/pycytominer/operations/get_na_columns.py
@@ -14,9 +14,10 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
     population_df : pandas.core.frame.DataFrame
         DataFrame that includes metadata and observation features.
     features : list, default "infer"
-         List of features present in the population dataframe [default: "infer"]
-         if "infer", then assume cell painting features are those that start with
-         "Cells_", "Nuclei_", or "Cytoplasm_".
+        A list of strings corresponding to feature measurement column names in the
+        `population_df` DataFrame. All features listed must be found in `population_df`.
+        Defaults to "infer". If "infer", then assume CellProfiler features are those
+        prefixed with "Cells", "Nuclei", or "Cytoplasm".
     samples : str, default "all"
         List of samples to perform operation on. The function uses a pd.DataFrame.query()
         function, so you should  structure samples in this fashion. An example is
@@ -36,8 +37,8 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
 
     if features == "infer":
         features = infer_cp_features(population_df)
-    else:
-        population_df = population_df.loc[:, features]
+
+    population_df = population_df.loc[:, features]
 
     num_rows = population_df.shape[0]
     na_prop_df = population_df.isna().sum() / num_rows

diff --git a/pycytominer/operations/noise_removal.py b/pycytominer/operations/noise_removal.py
@@ -22,9 +22,10 @@ def noise_removal(
         The list of unique perturbations corresponding to the rows in population_df. For example,
         perturb1_well1 and perturb1_well2 would both be "perturb1".
     features : list, default "infer"
-         List of features present in the population dataframe [default: "infer"]
-         if "infer", then assume cell painting features are those that start with
-         "Cells_", "Nuclei_", or "Cytoplasm_".
+        A list of strings corresponding to feature measurement column names in the
+        `population_df` DataFrame. All features listed must be found in `population_df`.
+        Defaults to "infer". If "infer", then assume CellProfiler features are those
+        prefixed with "Cells", "Nuclei", or "Cytoplasm".
     samples : str, default "all"
         List of samples to perform operation on. The function uses a pd.DataFrame.query()
         function, so you should  structure samples in this fashion. An example is

diff --git a/pycytominer/operations/variance_threshold.py b/pycytominer/operations/variance_threshold.py
@@ -18,9 +18,10 @@ def variance_threshold(
     population_df : pandas.core.frame.DataFrame
         DataFrame that includes metadata and observation features.
     features : list, default "infer"
-         List of features present in the population dataframe [default: "infer"]
-         if "infer", then assume cell painting features are those that start with
-         "Cells_", "Nuclei_", or "Cytoplasm_".
+        A list of strings corresponding to feature measurement column names in the
+        `population_df` DataFrame. All features listed must be found in `population_df`.
+        Defaults to "infer". If "infer", then assume CellProfiler features are those
+        prefixed with "Cells", "Nuclei", or "Cytoplasm".
     samples : str, default "all"
         List of samples to perform operation on. The function uses a pd.DataFrame.query()
         function, so you should  structure samples in this fashion. An example is

diff --git a/tests/test_cyto_utils/test_feature_infer.py b/tests/test_cyto_utils/test_feature_infer.py
@@ -39,10 +39,10 @@ def test_feature_infer():
 
 
 def test_feature_infer_nocp():
-    with pytest.raises(AssertionError) as nocp:
+    with pytest.raises(ValueError) as nocp:
         infer_cp_features(population_df=non_cp_data_df)
 
-    assert "No CP features found." in str(nocp.value)
+    assert "No features or metadata found." in str(nocp.value)
 
 
 def test_metadata_feature_infer():

diff --git a/tests/test_operations/test_correlation_threshold.py b/tests/test_operations/test_correlation_threshold.py
@@ -75,7 +75,7 @@ def test_correlation_threshold_samples():
 
 
 def test_correlation_threshold_featureinfer():
-    with pytest.raises(AssertionError) as nocp:
+    with pytest.raises(ValueError) as nocp:
         correlation_threshold_result = correlation_threshold(
             population_df=data_df,
             features="infer",
@@ -84,7 +84,7 @@ def test_correlation_threshold_featureinfer():
             method="pearson",
         )
 
-    assert "No CP features found." in str(nocp.value)
+    assert "No features or metadata found." in str(nocp.value)
 
     data_cp_df = data_df.copy()
     data_cp_df.columns = [f"Cells_{x}" for x in data_df.columns]

diff --git a/tests/test_operations/test_get_na_columns.py b/tests/test_operations/test_get_na_columns.py
@@ -67,9 +67,9 @@ def test_get_na_columns_sample():
 
 
 def test_get_na_columns_featureinfer():
-    with pytest.raises(AssertionError) as nocp:
+    with pytest.raises(ValueError) as nocp:
         get_na_columns(
             population_df=data_df, samples="all", features="infer", cutoff=0.1
         )
 
-    assert "No CP features found." in str(nocp.value)
+    assert "No features or metadata found." in str(nocp.value)
diff --git a/tests/test_operations/test_variance_threshold.py b/tests/test_operations/test_variance_threshold.py
@@ -102,12 +102,12 @@ def test_variance_threshold():
 
 def test_variance_threshold_featureinfer():
     unique_cut = 0.01
-    with pytest.raises(AssertionError) as nocp:
+    with pytest.raises(ValueError) as nocp:
         excluded_features = variance_threshold(
             population_df=data_unique_test_df, features="infer", unique_cut=unique_cut
         )
 
-    assert "No CP features found." in str(nocp.value)
+    assert "No features or metadata found." in str(nocp.value)
 
     data_cp_df = data_unique_test_df.copy()
     data_cp_df.columns = [f"Cells_{x}" for x in data_unique_test_df.columns]