Skip to content

Commit

Permalink
fix(steps): add unit test and fix unique col scaling (#158)
Browse files Browse the repository at this point in the history
  • Loading branch information
jitingxu1 authored Sep 25, 2024
1 parent b8aebcb commit c32d604
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 2 deletions.
13 changes: 11 additions & 2 deletions ibis_ml/steps/_standardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from collections.abc import Iterable

_DOCS_PAGE_NAME = "standardization"
# Threshold below which a fitted scale (max - min, or the fitted scale value)
# is treated as zero: such near-constant columns are divided by 1.0 instead.
_APPROX_EPS = 1e-6


class ScaleMinMax(Step):
Expand Down Expand Up @@ -68,7 +70,11 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
def transform_table(self, table: ir.Table) -> ir.Table:
    """Rescale each fitted column as ``(x - min) / (max - min)``.

    ``self.stats_`` maps a column name to its fitted ``(max, min)`` pair.
    When the fitted range is smaller than ``_APPROX_EPS`` in absolute
    value, the column is divided by ``1.0`` instead, so a constant or
    near-constant column is not divided by a vanishing range.

    Parameters
    ----------
    table
        Table containing every column named in ``self.stats_``.

    Returns
    -------
    ir.Table
        The result of ``table.mutate`` with each fitted column replaced by
        its rescaled expression under the same name.
    """
    rescaled = []
    # Names chosen to avoid shadowing the ``max``/``min`` builtins.
    for c, (col_max, col_min) in self.stats_.items():
        col_range = col_max - col_min
        # for near-constant column, set the scale to 1.0
        denominator = 1.0 if abs(col_range) < _APPROX_EPS else col_range
        rescaled.append(((table[c] - col_min) / denominator).name(c))
    return table.mutate(rescaled)
Expand Down Expand Up @@ -128,7 +134,10 @@ def fit_table(self, table: ir.Table, metadata: Metadata) -> None:
def transform_table(self, table: ir.Table) -> ir.Table:
    """Standardize each fitted column as ``(x - center) / scale``.

    ``self.stats_`` maps a column name to its fitted ``(center, scale)``
    pair. When ``abs(scale)`` is smaller than ``_APPROX_EPS``, the column
    is divided by ``1.0`` instead, so a constant or near-constant column
    is not divided by a vanishing scale.

    Parameters
    ----------
    table
        Table containing every column named in ``self.stats_``.

    Returns
    -------
    ir.Table
        The result of ``table.mutate`` with each fitted column replaced by
        its standardized expression under the same name.
    """
    standardized = []
    for c, (center, scale) in self.stats_.items():
        # for near-constant column, set the scale to 1.0
        denominator = 1.0 if abs(scale) < _APPROX_EPS else scale
        standardized.append(((table[c] - center) / denominator).name(c))
    return table.mutate(standardized)
42 changes: 42 additions & 0 deletions tests/test_standardize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import ibis
import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest

import ibis_ml as ml


def test_scalestandard():
    """ScaleStandard output matches ``(x - mean) / std`` computed with NumPy."""
    values = np.arange(0, 100)
    source = ibis.memtable({"col": values})

    scaler = ml.ScaleStandard("col")
    scaler.fit_table(source, ml.core.Metadata())
    actual = scaler.transform_table(source).execute()

    # Reference values use NumPy's default mean/std over the same data.
    reference = pd.DataFrame({"col": (values - np.mean(values)) / np.std(values)})
    tm.assert_frame_equal(actual, reference, check_exact=False)


def test_scaleminmax():
    """ScaleMinMax output matches ``(x - min) / (max - min)`` computed with NumPy."""
    values = np.arange(0, 100)
    source = ibis.memtable({"col": values})

    scaler = ml.ScaleMinMax("col")
    scaler.fit_table(source, ml.core.Metadata())
    actual = scaler.transform_table(source).execute()

    lowest = np.min(values)
    highest = np.max(values)
    reference = pd.DataFrame({"col": (values - lowest) / (highest - lowest)})
    tm.assert_frame_equal(actual, reference, check_exact=False)


@pytest.mark.parametrize("scaler", ["ScaleStandard", "ScaleMinMax"])
def test_constant_columns(scaler):
    """Single-valued int and float columns are scaled to exactly 0.0."""
    source = ibis.memtable({"int_col": [100], "float_col": [100.0]})

    # Resolve the step class by name so both scalers share one test body.
    step = getattr(ml, scaler)(ml.numeric())
    step.fit_table(source, ml.core.Metadata())
    actual = step.transform_table(source).execute()

    reference = pd.DataFrame({"int_col": [0.0], "float_col": [0.0]})
    tm.assert_frame_equal(actual, reference)

0 comments on commit c32d604

Please sign in to comment.