Skip to content

Commit

Permalink
feat: Load the necessary algorithms dynamically
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez committed Nov 8, 2024
1 parent c82c887 commit 85dc956
Show file tree
Hide file tree
Showing 21 changed files with 146 additions and 120 deletions.
28 changes: 26 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,40 @@ adding new matching methods, or expanding support for additional standards.
See the instructions below to get started!


Formatting the Code
-------------------

We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
The CI runs for every pull request and will fail if code is not properly formatted.
To make sure the formatting is correct, follow the steps below.

Make sure you have black installed:
```
pip install black
```

To format the code, run the following command before committing your changes:
```
make format
```

Or you can use the black command directly:
```
black ./bdikit/
```


Adding New Matching Methods
---------------------------

Contributors can add new methods for schema and value matching by following these steps:

1. Create a Python module inside the `algorithms` folder (e.g., `bdikit/value_matching/algorithms`).
1. Create a Python module inside the task folder (e.g., `bdikit/value_matching`).

2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).

3. Instantiate an object of your class in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Ensure your module is properly imported in the `__init__.py` file (e.g.,` bdikit/value_matching/__init__.py`).
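3. Add a new entry in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). The entry must contain the full import path of your class
(module path plus class name, e.g., `bdikit.value_matching.polyfuzz.TFIDFValueMatcher`), because the class is loaded dynamically from that path.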


Code of Conduct
Expand Down
20 changes: 2 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ The `bdi-kit` is a library that assist users in performing data harmonization. I

**Warning:** `bdi-kit` is currently in *alpha* stage and under heavy development. Expect APIs to change.


## Documentation

Documentation is available at [https://bdi-kit.readthedocs.io/](https://bdi-kit.readthedocs.io/).
Expand All @@ -36,21 +37,4 @@ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel

## Contributing

We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
The CI runs for every pull request and will fail if code is not properly formatted.
To make sure formatting is correct, you can do the following steps.

Make sure you have black installed:
```
pip install black
```

To format the code, anyone can use the command before committing your changes:
```
make format
```

Or you can use the black command directly:
```
black ./bdikit/
```
To learn more about making a contribution to bdi-kit, please see our [Contributing guide](./CONTRIBUTING.md).
6 changes: 3 additions & 3 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def match_schema(
if isinstance(method, str):
if method_args is None:
method_args = {}
matcher_instance = SchemaMatchers.get_instance(method, **method_args)
matcher_instance = SchemaMatchers.get_matcher(method, **method_args)
elif isinstance(method, BaseSchemaMatcher):
matcher_instance = method
else:
Expand Down Expand Up @@ -132,7 +132,7 @@ def top_matches(
if isinstance(method, str):
if method_args is None:
method_args = {}
topk_matcher = TopkMatchers.get_instance(method, **method_args)
topk_matcher = TopkMatchers.get_matcher(method, **method_args)
elif isinstance(method, BaseTopkSchemaMatcher):
topk_matcher = method
else:
Expand Down Expand Up @@ -343,7 +343,7 @@ def _match_values(
target_domain, column_mapping_list = _format_value_matching_input(
source, target, column_mapping
)
value_matcher = ValueMatchers.get_instance(method, **method_args)
value_matcher = ValueMatchers.get_matcher(method, **method_args)
mapping_results: List[ValueMatchingResult] = []

for mapping in column_mapping_list:
Expand Down
5 changes: 0 additions & 5 deletions bdikit/schema_matching/best/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +0,0 @@
from bdikit.schema_matching.best.algorithms.valentine import *
from bdikit.schema_matching.best.algorithms.gpt import *
from bdikit.schema_matching.best.algorithms.contrastivelearning import *
from bdikit.schema_matching.best.algorithms.twophase import *
from bdikit.schema_matching.best.algorithms.maxvalsim import *
1 change: 0 additions & 1 deletion bdikit/schema_matching/best/algorithms/__init__.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import pandas as pd

from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
from bdikit.schema_matching.topk.algorithms.contrastivelearning import (
CLTopkSchemaMatcher,
)
from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher


class ContrastiveLearningSchemaMatcher(BaseSchemaMatcher):
Expand Down
File renamed without changes.
84 changes: 51 additions & 33 deletions bdikit/schema_matching/best/matcher_factory.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,63 @@
import importlib
from enum import Enum
from typing import Mapping, Any, Type
from typing import Mapping, Any
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best import (
SimFloodSchemaMatcher,
ComaSchemaMatcher,
CupidSchemaMatcher,
DistributionBasedSchemaMatcher,
JaccardSchemaMatcher,
GPTSchemaMatcher,
ContrastiveLearningSchemaMatcher,
TwoPhaseSchemaMatcher,
MaxValSimSchemaMatcher,
)


class SchemaMatchers(Enum):
    """Registry of the supported schema matching algorithms.

    Each member is a ``(matcher_name, matcher_path)`` pair: the public name
    used to request the algorithm and the dotted import path
    (``package.module.ClassName``) of the class that implements it. The
    implementing module is imported only when the algorithm is requested
    through :meth:`get_matcher`.
    """

    SIMFLOOD = (
        "similarity_flooding",
        "bdikit.schema_matching.best.valentine.SimFloodSchemaMatcher",
    )
    COMA = (
        "coma",
        "bdikit.schema_matching.best.valentine.ComaSchemaMatcher",
    )
    CUPID = (
        "cupid",
        "bdikit.schema_matching.best.valentine.CupidSchemaMatcher",
    )
    DISTRIBUTION_BASED = (
        "distribution_based",
        "bdikit.schema_matching.best.valentine.DistributionBasedSchemaMatcher",
    )
    # The class is named JaccardSchemaMatcher (not JaccardDistanceSchemaMatcher);
    # a wrong class name here only fails at lookup time, inside get_matcher.
    JACCARD_DISTANCE = (
        "jaccard_distance",
        "bdikit.schema_matching.best.valentine.JaccardSchemaMatcher",
    )
    GPT = ("gpt", "bdikit.schema_matching.best.gpt.GPTSchemaMatcher")
    CT_LEARNING = (
        "ct_learning",
        "bdikit.schema_matching.best.contrastivelearning.ContrastiveLearningSchemaMatcher",
    )
    TWO_PHASE = (
        "two_phase",
        "bdikit.schema_matching.best.twophase.TwoPhaseSchemaMatcher",
    )
    MAX_VAL_SIM = (
        "max_val_sim",
        "bdikit.schema_matching.best.maxvalsim.MaxValSimSchemaMatcher",
    )

    def __init__(self, matcher_name: str, matcher_path: str):
        # Enum unpacks each member's tuple value into these two arguments.
        self.matcher_name = matcher_name
        self.matcher_path = matcher_path

    @staticmethod
    def get_matcher(
        matcher_name: str, **matcher_kwargs: Any
    ) -> "BaseSchemaMatcher":
        """Instantiate the schema matcher registered under ``matcher_name``.

        Args:
            matcher_name: Public name of the algorithm (e.g. ``"coma"``).
            **matcher_kwargs: Keyword arguments forwarded unchanged to the
                matcher class constructor.

        Returns:
            A new instance of the class registered for ``matcher_name``.

        Raises:
            ValueError: If ``matcher_name`` is not a registered algorithm.
            ImportError: If the module in the registered path cannot be
                imported.
            AttributeError: If the module does not define the registered class.
        """
        if matcher_name not in matchers:
            names = ", ".join(matchers)
            raise ValueError(
                f"The {matcher_name} algorithm is not supported. "
                f"Supported algorithms are: {names}"
            )
        # Load the class dynamically: split "pkg.module.Class" at the last dot.
        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
        module = importlib.import_module(module_path)

        return getattr(module, class_name)(**matcher_kwargs)


# Name -> dotted class path lookup table. It is built after the class body
# because it iterates the enum members; get_matcher reads it at call time.
matchers = {method.matcher_name: method.matcher_path for method in SchemaMatchers}
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.algorithms.contrastivelearning import (
CLTopkSchemaMatcher,
)
from bdikit.value_matching.algorithms.polyfuzz import TFIDFValueMatcher
from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher
from bdikit.value_matching.polyfuzz import TFIDFValueMatcher
from bdikit.value_matching.base import BaseValueMatcher


Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
import pandas as pd
from typing import Optional

from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best.algorithms.valentine import SimFloodSchemaMatcher
from bdikit.schema_matching.best.valentine import SimFloodSchemaMatcher
from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.algorithms.contrastivelearning import (
CLTopkSchemaMatcher,
)
from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher


class TwoPhaseSchemaMatcher(BaseSchemaMatcher):
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion bdikit/schema_matching/topk/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from bdikit.schema_matching.topk.algorithms.contrastivelearning import *

36 changes: 22 additions & 14 deletions bdikit/schema_matching/topk/matcher_factory.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,34 @@
import importlib
from enum import Enum
from typing import Mapping, Any, Type
from typing import Mapping, Any
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk import CLTopkSchemaMatcher


class TopkMatchers(Enum):
    """Registry of the supported top-k schema matching algorithms.

    Each member is a ``(matcher_name, matcher_path)`` pair: the public name
    used to request the algorithm and the dotted import path
    (``package.module.ClassName``) of the class that implements it. The
    implementing module is imported only when the algorithm is requested
    through :meth:`get_matcher`.
    """

    CT_LEARNING = (
        "ct_learning",
        "bdikit.schema_matching.topk.contrastivelearning.CLTopkSchemaMatcher",
    )

    def __init__(self, matcher_name: str, matcher_path: str):
        # Enum unpacks each member's tuple value into these two arguments.
        self.matcher_name = matcher_name
        self.matcher_path = matcher_path

    @staticmethod
    def get_matcher(
        matcher_name: str, **matcher_kwargs: Any
    ) -> "BaseTopkSchemaMatcher":
        """Instantiate the top-k matcher registered under ``matcher_name``.

        Args:
            matcher_name: Public name of the algorithm (e.g. ``"ct_learning"``).
            **matcher_kwargs: Keyword arguments forwarded unchanged to the
                matcher class constructor.

        Returns:
            A new instance of the class registered for ``matcher_name``.

        Raises:
            ValueError: If ``matcher_name`` is not a registered algorithm.
            ImportError: If the module in the registered path cannot be
                imported.
            AttributeError: If the module does not define the registered class.
        """
        if matcher_name not in matchers:
            names = ", ".join(matchers)
            raise ValueError(
                f"The {matcher_name} algorithm is not supported. "
                f"Supported algorithms are: {names}"
            )
        # Load the class dynamically: split "pkg.module.Class" at the last dot.
        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
        module = importlib.import_module(module_path)

        return getattr(module, class_name)(**matcher_kwargs)


# Name -> dotted class path lookup table. It is built after the class body
# because it iterates the enum members; get_matcher reads it at call time.
matchers = {method.matcher_name: method.matcher_path for method in TopkMatchers}
2 changes: 0 additions & 2 deletions bdikit/value_matching/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +0,0 @@
from bdikit.value_matching.algorithms.polyfuzz import *
from bdikit.value_matching.algorithms.gpt import *
Empty file.
File renamed without changes.
56 changes: 32 additions & 24 deletions bdikit/value_matching/matcher_factory.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,44 @@
import importlib
from enum import Enum
from typing import Mapping, Any, Type
from typing import Mapping, Any
from bdikit.value_matching.base import BaseValueMatcher
from bdikit.value_matching import (
GPTValueMatcher,
TFIDFValueMatcher,
EditDistanceValueMatcher,
EmbeddingValueMatcher,
FastTextValueMatcher,
)


class ValueMatchers(Enum):
    """Registry of the supported value matching algorithms.

    Each member is a ``(matcher_name, matcher_path)`` pair: the public name
    used to request the algorithm and the dotted import path
    (``package.module.ClassName``) of the class that implements it. The
    implementing module is imported only when the algorithm is requested
    through :meth:`get_matcher`.
    """

    TFIDF = ("tfidf", "bdikit.value_matching.polyfuzz.TFIDFValueMatcher")
    EDIT = (
        "edit_distance",
        "bdikit.value_matching.polyfuzz.EditDistanceValueMatcher",
    )
    EMBEDDINGS = (
        "embedding",
        "bdikit.value_matching.polyfuzz.EmbeddingValueMatcher",
    )
    FASTTEXT = (
        "fasttext",
        "bdikit.value_matching.polyfuzz.FastTextValueMatcher",
    )
    GPT = ("gpt", "bdikit.value_matching.gpt.GPTValueMatcher")

    def __init__(self, matcher_name: str, matcher_path: str):
        # Enum unpacks each member's tuple value into these two arguments.
        self.matcher_name = matcher_name
        self.matcher_path = matcher_path

    @staticmethod
    def get_matcher(
        matcher_name: str, **matcher_kwargs: Any
    ) -> "BaseValueMatcher":
        """Instantiate the value matcher registered under ``matcher_name``.

        Args:
            matcher_name: Public name of the algorithm (e.g. ``"tfidf"``).
            **matcher_kwargs: Keyword arguments forwarded unchanged to the
                matcher class constructor.

        Returns:
            A new instance of the class registered for ``matcher_name``.

        Raises:
            ValueError: If ``matcher_name`` is not a registered algorithm.
            ImportError: If the module in the registered path cannot be
                imported.
            AttributeError: If the module does not define the registered class.
        """
        if matcher_name not in matchers:
            names = ", ".join(matchers)
            raise ValueError(
                f"The {matcher_name} algorithm is not supported. "
                f"Supported algorithms are: {names}"
            )
        # Load the class dynamically: split "pkg.module.Class" at the last dot.
        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
        module = importlib.import_module(module_path)

        return getattr(module, class_name)(**matcher_kwargs)


# Name -> dotted class path lookup table. It is built after the class body
# because it iterates the enum members; get_matcher reads it at call time.
matchers = {method.matcher_name: method.matcher_path for method in ValueMatchers}
File renamed without changes.
6 changes: 3 additions & 3 deletions tests/test_schema_matching.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import pandas as pd
from bdikit.schema_matching.best import (
from bdikit.schema_matching.best.valentine import (
SimFloodSchemaMatcher,
JaccardSchemaMatcher,
DistributionBasedSchemaMatcher,
ComaSchemaMatcher,
CupidSchemaMatcher,
TwoPhaseSchemaMatcher,
ContrastiveLearningSchemaMatcher,
)
from bdikit.schema_matching.best.twophase import TwoPhaseSchemaMatcher
from bdikit.schema_matching.best.contrastivelearning import ContrastiveLearningSchemaMatcher


def test_basic_column_mapping_algorithms():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_value_matching.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest
import pandas as pd
from bdikit.value_matching import (
from bdikit.value_matching.polyfuzz import (
TFIDFValueMatcher,
EditDistanceValueMatcher,
)
Expand Down

0 comments on commit 85dc956

Please sign in to comment.