Skip to content

Commit

Permalink
Update error message and docs for features argument to clarify Cell…
Browse files Browse the repository at this point in the history
…Profiler default expectations and how to handle non-CellProfiler data (#448)

* updated error message

* updated error messages in tests

* update metadata_features docstrings

* update docstrings for feature argument

* update features docstring and remove unused indent

* clarify error message to avoid confusing recommendation to update features parameter since it does not exist in the function

* update error message in tests

* Update pycytominer/consensus.py

Co-authored-by: Dave Bunten <[email protected]>

* Update pycytominer/cyto_utils/features.py

Co-authored-by: Dave Bunten <[email protected]>

* Update pycytominer/cyto_utils/features.py

Co-authored-by: Dave Bunten <[email protected]>

* Update pycytominer/normalize.py

Co-authored-by: Dave Bunten <[email protected]>

* Update pycytominer/cyto_utils/write_gct.py

Co-authored-by: Dave Bunten <[email protected]>

* made function docs multi-lined

* updated test with error messages

---------

Co-authored-by: Gregory Way <[email protected]>
Co-authored-by: Dave Bunten <[email protected]>
  • Loading branch information
3 people authored Sep 27, 2024
1 parent c6ad2a0 commit 2fd5ca3
Show file tree
Hide file tree
Showing 14 changed files with 50 additions and 37 deletions.
2 changes: 1 addition & 1 deletion pycytominer/consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def consensus(
features : list
A list of strings corresponding to feature measurement column names in the
`profiles` DataFrame. All features listed must be found in `profiles`.
Defaults to "infer". If "infer", then assume cell painting features are those
Defaults to "infer". If "infer", then assume features are from CellProfiler output and
prefixed with "Cells", "Nuclei", or "Cytoplasm".
output_file : str, optional
If provided, will write consensus profiles to file. If not specified, will
Expand Down
17 changes: 12 additions & 5 deletions pycytominer/cyto_utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def infer_cp_features(
metadata=False,
image_features=False,
):
"""Given a dataframe, output features that we expect to be Cell Painting features.
"""Given CellProfiler output data read as a DataFrame, output feature column names as a list.
Parameters
----------
Expand All @@ -90,6 +90,8 @@ def infer_cp_features(
Compartments from which Cell Painting features were extracted.
metadata : bool, default False
Whether or not to infer metadata features.
If metadata is set to True, find column names that begin with the `Metadata_` prefix.
This convention is expected by CellProfiler defaults.
image_features : bool, default False
Whether or not the profiles contain image features.
Expand All @@ -115,9 +117,12 @@ def infer_cp_features(
population_df.columns.str.startswith("Metadata_")
].tolist()

assert ( # noqa: S101
len(features) > 0
), "No CP features found. Are you sure this dataframe is from CellProfiler?"
if len(features) == 0:
raise ValueError(
"No features or metadata found. Pycytominer expects CellProfiler column names by default. "
"If you're using non-CellProfiler data, please do not 'infer' features. "
"Instead, check if the function has a `features` or `meta_features` parameter, and input column names manually."
)

return features

Expand Down Expand Up @@ -150,7 +155,9 @@ def drop_outlier_features(
population_df : pandas.core.frame.DataFrame
DataFrame that includes metadata and observation features.
features : list of str or str, default "infer"
Features present in the population dataframe. If "infer", then assume Cell Painting features are those that start with "Cells_", "Nuclei_", or "Cytoplasm_"
Features present in the population dataframe. If "infer",
then assume CellProfiler feature conventions
(column names that start with "Cells_", "Nuclei_", or "Cytoplasm_").
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/cyto_utils/modz.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,10 @@ def modz(
a string or list of column(s) in the population dataframe that
indicate replicate level information
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
method : str, default "spearman"
indicating which correlation metric to use.
min_weight : float, default 0.01
Expand Down
2 changes: 1 addition & 1 deletion pycytominer/cyto_utils/write_gct.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def write_gct(
features : list
A list of strings corresponding to feature measurement column names in the
`profiles` DataFrame. All features listed must be found in `profiles`.
Defaults to "infer". If "infer", then assume cell painting features are those
Defaults to "infer". If "infer", then assume features are from CellProfiler output and
prefixed with "Cells", "Nuclei", or "Cytoplasm".
meta_features : list
A list of strings corresponding to metadata column names in the `profiles`
Expand Down
4 changes: 2 additions & 2 deletions pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@ def feature_select(
----------
profiles : pandas.core.frame.DataFrame or file
DataFrame or file of profiles.
features : list
features : list, default "infer"
A list of strings corresponding to feature measurement column names in the
`profiles` DataFrame. All features listed must be found in `profiles`.
Defaults to "infer". If "infer", then assume cell painting features are those
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
image_features: bool, default False
Whether the profiles contain image features.
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,15 @@ def normalize(
features : list
A list of strings corresponding to feature measurement column names in the
`profiles` DataFrame. All features listed must be found in `profiles`.
Defaults to "infer". If "infer", then assume cell painting features are those
Defaults to "infer". If "infer", then assume features are from CellProfiler output and
prefixed with "Cells", "Nuclei", or "Cytoplasm".
image_features: bool, default False
Whether the profiles contain image features.
meta_features : list
A list of strings corresponding to metadata column names in the `profiles`
DataFrame. All features listed must be found in `profiles`. Defaults to "infer".
If "infer", then assume metadata features are those prefixed with "Metadata"
If "infer", then assume CellProfiler metadata features, identified by
column names that begin with the `Metadata_` prefix.
samples : str
The metadata column values to use as a normalization reference. We often use
control samples. The function uses a pd.query() function, so you should
Expand Down Expand Up @@ -114,7 +115,7 @@ def normalize(
normalized_df = normalize(
profiles=data_df,
features=["x", "y", "z", "zz"],
meta_features="infer",
meta_features=["Metadata_plate", "Metadata_treatment"],
samples="Metadata_treatment == 'control'",
method="standardize"
)
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/operations/correlation_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ def correlation_threshold(
population_df : pandas.core.frame.DataFrame
DataFrame that includes metadata and observation features.
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand Down
11 changes: 6 additions & 5 deletions pycytominer/operations/get_na_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):
population_df : pandas.core.frame.DataFrame
DataFrame that includes metadata and observation features.
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand All @@ -36,8 +37,8 @@ def get_na_columns(population_df, features="infer", samples="all", cutoff=0.05):

if features == "infer":
features = infer_cp_features(population_df)
else:
population_df = population_df.loc[:, features]

population_df = population_df.loc[:, features]

num_rows = population_df.shape[0]
na_prop_df = population_df.isna().sum() / num_rows
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/operations/noise_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ def noise_removal(
The list of unique perturbations corresponding to the rows in population_df. For example,
perturb1_well1 and perturb1_well2 would both be "perturb1".
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand Down
7 changes: 4 additions & 3 deletions pycytominer/operations/variance_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ def variance_threshold(
population_df : pandas.core.frame.DataFrame
DataFrame that includes metadata and observation features.
features : list, default "infer"
List of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that start with
"Cells_", "Nuclei_", or "Cytoplasm_".
A list of strings corresponding to feature measurement column names in the
`population_df` DataFrame. All features listed must be found in `population_df`.
Defaults to "infer". If "infer", then assume CellProfiler features are those
prefixed with "Cells", "Nuclei", or "Cytoplasm".
samples : str, default "all"
List of samples to perform operation on. The function uses a pd.DataFrame.query()
function, so you should structure samples in this fashion. An example is
Expand Down
4 changes: 2 additions & 2 deletions tests/test_cyto_utils/test_feature_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ def test_feature_infer():


def test_feature_infer_nocp():
with pytest.raises(AssertionError) as nocp:
with pytest.raises(ValueError) as nocp:
infer_cp_features(population_df=non_cp_data_df)

assert "No CP features found." in str(nocp.value)
assert "No features or metadata found." in str(nocp.value)


def test_metadata_feature_infer():
Expand Down
4 changes: 2 additions & 2 deletions tests/test_operations/test_correlation_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_correlation_threshold_samples():


def test_correlation_threshold_featureinfer():
with pytest.raises(AssertionError) as nocp:
with pytest.raises(ValueError) as nocp:
correlation_threshold_result = correlation_threshold(
population_df=data_df,
features="infer",
Expand All @@ -84,7 +84,7 @@ def test_correlation_threshold_featureinfer():
method="pearson",
)

assert "No CP features found." in str(nocp.value)
assert "No features found." in str(nocp.value)

data_cp_df = data_df.copy()
data_cp_df.columns = [f"Cells_{x}" for x in data_df.columns]
Expand Down
4 changes: 2 additions & 2 deletions tests/test_operations/test_get_na_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ def test_get_na_columns_sample():


def test_get_na_columns_featureinfer():
with pytest.raises(AssertionError) as nocp:
with pytest.raises(ValueError) as nocp:
get_na_columns(
population_df=data_df, samples="all", features="infer", cutoff=0.1
)

assert "No CP features found." in str(nocp.value)
assert "No features found." in str(nocp.value)
4 changes: 2 additions & 2 deletions tests/test_operations/test_variance_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,12 @@ def test_variance_threshold():

def test_variance_threshold_featureinfer():
unique_cut = 0.01
with pytest.raises(AssertionError) as nocp:
with pytest.raises(ValueError) as nocp:
excluded_features = variance_threshold(
population_df=data_unique_test_df, features="infer", unique_cut=unique_cut
)

assert "No CP features found." in str(nocp.value)
assert "No features found." in str(nocp.value)

data_cp_df = data_unique_test_df.copy()
data_cp_df.columns = [f"Cells_{x}" for x in data_unique_test_df.columns]
Expand Down

0 comments on commit 2fd5ca3

Please sign in to comment.