feat: Load the necessary algorithms dynamically

VIDA-NYU · Nov 8, 2024 · 85dc956 · 85dc956
1 parent c82c887
commit 85dc956
Show file tree

Hide file tree

Showing 21 changed files with 146 additions and 120 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -7,16 +7,40 @@ adding new matching methods, or expanding support for additional standards.
 See the instructions below to get started!
 
 
+Formatting the Code
+-------------------
+
+We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
+The CI runs for every pull request and will fail if code is not properly formatted.
+To make sure formatting is correct, you can do the following steps.
+
+Make sure you have black installed:
+```
+pip install black
+```
+
+To format the code, run the following command before committing your changes:
+```
+make format
+```
+
+Or you can use the black command directly:
+```
+black ./bdikit/
+```
+
+
 Adding New Matching Methods
 ---------------------------
 
 Contributors can add new methods for schema and value matching by following these steps:
 
-1. Create a Python module inside the `algorithms` folder (e.g., `bdikit/value_matching/algorithms`).
+1. Create a Python module inside the folder of the corresponding task (e.g., `bdikit/value_matching`).
 
 2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).
 
-3. Instantiate an object of your class in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Ensure your module is properly imported in the `__init__.py` file (e.g.,` bdikit/value_matching/__init__.py`).
+3. Add a new entry in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`) that maps the matcher name to the
+full import path of your class (e.g., `bdikit.value_matching.polyfuzz.TFIDFValueMatcher`), so that it can be loaded dynamically without errors.
 
 
 Code of Conduct

diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@ The `bdi-kit` is a library that assist users in performing data harmonization. I
 
 **Warning:** `bdi-kit` is currently in *alpha* stage and under heavy development. Expect APIs to change.
 
+
 ## Documentation
 
 Documentation is available at [https://bdi-kit.readthedocs.io/](https://bdi-kit.readthedocs.io/).
@@ -36,21 +37,4 @@ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel
 
 ## Contributing
 
-We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
-The CI runs for every pull request and will fail if code is not properly formatted.
-To make sure formatting is correct, you can do the following steps.
-
-Make sure you have black installed:
-```
-pip install black
-```
-
-To format the code, anyone can use the command before committing your changes:
-```
-make format
-```
-
-Or you can use the black command directly:
-```
-black ./bdikit/
-```
+To learn more about making a contribution to bdi-kit, please see our [Contributing guide](./CONTRIBUTING.md).
diff --git a/bdikit/api.py b/bdikit/api.py
@@ -74,7 +74,7 @@ def match_schema(
     if isinstance(method, str):
         if method_args is None:
             method_args = {}
-        matcher_instance = SchemaMatchers.get_instance(method, **method_args)
+        matcher_instance = SchemaMatchers.get_matcher(method, **method_args)
     elif isinstance(method, BaseSchemaMatcher):
         matcher_instance = method
     else:
@@ -132,7 +132,7 @@ def top_matches(
     if isinstance(method, str):
         if method_args is None:
             method_args = {}
-        topk_matcher = TopkMatchers.get_instance(method, **method_args)
+        topk_matcher = TopkMatchers.get_matcher(method, **method_args)
     elif isinstance(method, BaseTopkSchemaMatcher):
         topk_matcher = method
     else:
@@ -343,7 +343,7 @@ def _match_values(
     target_domain, column_mapping_list = _format_value_matching_input(
         source, target, column_mapping
     )
-    value_matcher = ValueMatchers.get_instance(method, **method_args)
+    value_matcher = ValueMatchers.get_matcher(method, **method_args)
     mapping_results: List[ValueMatchingResult] = []
 
     for mapping in column_mapping_list:

diff --git a/bdikit/schema_matching/best/__init__.py b/bdikit/schema_matching/best/__init__.py
@@ -1,5 +0,0 @@
-from bdikit.schema_matching.best.algorithms.valentine import *
-from bdikit.schema_matching.best.algorithms.gpt import *
-from bdikit.schema_matching.best.algorithms.contrastivelearning import *
-from bdikit.schema_matching.best.algorithms.twophase import *
-from bdikit.schema_matching.best.algorithms.maxvalsim import *

diff --git a/bdikit/schema_matching/best/algorithms/__init__.py b/bdikit/schema_matching/best/algorithms/__init__.py
diff --git a/...ng/best/algorithms/contrastivelearning.py → ...hema_matching/best/contrastivelearning.py b/...ng/best/algorithms/contrastivelearning.py → ...hema_matching/best/contrastivelearning.py
@@ -1,10 +1,7 @@
 import pandas as pd
-
 from bdikit.schema_matching.best.base import BaseSchemaMatcher
 from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
-from bdikit.schema_matching.topk.algorithms.contrastivelearning import (
-    CLTopkSchemaMatcher,
-)
+from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher
 
 
 class ContrastiveLearningSchemaMatcher(BaseSchemaMatcher):

diff --git a/...it/schema_matching/best/algorithms/gpt.py → bdikit/schema_matching/best/gpt.py b/...it/schema_matching/best/algorithms/gpt.py → bdikit/schema_matching/best/gpt.py
diff --git a/bdikit/schema_matching/best/matcher_factory.py b/bdikit/schema_matching/best/matcher_factory.py
@@ -1,45 +1,63 @@
+import importlib
 from enum import Enum
-from typing import Mapping, Any, Type
+from typing import Mapping, Any
 from bdikit.schema_matching.best.base import BaseSchemaMatcher
-from bdikit.schema_matching.best import (
-    SimFloodSchemaMatcher,
-    ComaSchemaMatcher,
-    CupidSchemaMatcher,
-    DistributionBasedSchemaMatcher,
-    JaccardSchemaMatcher,
-    GPTSchemaMatcher,
-    ContrastiveLearningSchemaMatcher,
-    TwoPhaseSchemaMatcher,
-    MaxValSimSchemaMatcher,
-)
 
 
 class SchemaMatchers(Enum):
-    SIMFLOOD = ("similarity_flooding", SimFloodSchemaMatcher)
-    COMA = ("coma", ComaSchemaMatcher)
-    CUPID = ("cupid", CupidSchemaMatcher)
-    DISTRIBUTION_BASED = ("distribution_based", DistributionBasedSchemaMatcher)
-    JACCARD_DISTANCE = ("jaccard_distance", JaccardSchemaMatcher)
-    GPT = ("gpt", GPTSchemaMatcher)
-    CT_LEARNING = ("ct_learning", ContrastiveLearningSchemaMatcher)
-    TWO_PHASE = ("two_phase", TwoPhaseSchemaMatcher)
-    MAX_VAL_SIM = ("max_val_sim", MaxValSimSchemaMatcher)
+    SIMFLOOD = (
+        "similarity_flooding",
+        "bdikit.schema_matching.best.valentine.SimFloodSchemaMatcher",
+    )
+    COMA = (
+        "coma",
+        "bdikit.schema_matching.best.valentine.ComaSchemaMatcher",
+    )
+    CUPID = (
+        "cupid",
+        "bdikit.schema_matching.best.valentine.CupidSchemaMatcher",
+    )
+    DISTRIBUTION_BASED = (
+        "distribution_based",
+        "bdikit.schema_matching.best.valentine.DistributionBasedSchemaMatcher",
+    )
+    JACCARD_DISTANCE = (
+        "jaccard_distance",
+        "bdikit.schema_matching.best.valentine.JaccardSchemaMatcher",
+    )
+    GPT = ("gpt", "bdikit.schema_matching.best.gpt.GPTSchemaMatcher")
+    CT_LEARNING = (
+        "ct_learning",
+        "bdikit.schema_matching.best.contrastivelearning.ContrastiveLearningSchemaMatcher",
+    )
+    TWO_PHASE = (
+        "two_phase",
+        "bdikit.schema_matching.best.twophase.TwoPhaseSchemaMatcher",
+    )
+    MAX_VAL_SIM = (
+        "max_val_sim",
+        "bdikit.schema_matching.best.maxvalsim.MaxValSimSchemaMatcher",
+    )
 
-    def __init__(self, method_name: str, method_class: Type[BaseSchemaMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
+    def __init__(self, matcher_name: str, matcher_path: str):
+        self.matcher_name = matcher_name
+        self.matcher_path = matcher_path
 
     @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
+    def get_matcher(
+        matcher_name: str, **matcher_kwargs: Mapping[str, Any]
     ) -> BaseSchemaMatcher:
-        methods = {method.method_name: method.method_class for method in SchemaMatchers}
-
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
+        if matcher_name not in matchers:
+            names = ", ".join(list(matchers.keys()))
             raise ValueError(
-                f"The {method_name} algorithm is not supported. "
+                f"The {matcher_name} algorithm is not supported. "
                 f"Supported algorithms are: {names}"
             )
+        # Load the class dynamically
+        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
+        module = importlib.import_module(module_path)
+
+        return getattr(module, class_name)(**matcher_kwargs)
+
+
+matchers = {method.matcher_name: method.matcher_path for method in SchemaMatchers}
diff --git a/...ema_matching/best/algorithms/maxvalsim.py → bdikit/schema_matching/best/maxvalsim.py b/...ema_matching/best/algorithms/maxvalsim.py → bdikit/schema_matching/best/maxvalsim.py
@@ -3,10 +3,8 @@
 from bdikit.schema_matching.best.base import BaseSchemaMatcher
 from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
 from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
-from bdikit.schema_matching.topk.algorithms.contrastivelearning import (
-    CLTopkSchemaMatcher,
-)
-from bdikit.value_matching.algorithms.polyfuzz import TFIDFValueMatcher
+from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher
+from bdikit.value_matching.polyfuzz import TFIDFValueMatcher
 from bdikit.value_matching.base import BaseValueMatcher
 
 

diff --git a/...hema_matching/best/algorithms/twophase.py → bdikit/schema_matching/best/twophase.py b/...hema_matching/best/algorithms/twophase.py → bdikit/schema_matching/best/twophase.py
@@ -1,13 +1,10 @@
 import pandas as pd
 from typing import Optional
-
 from bdikit.schema_matching.best.base import BaseSchemaMatcher
-from bdikit.schema_matching.best.algorithms.valentine import SimFloodSchemaMatcher
+from bdikit.schema_matching.best.valentine import SimFloodSchemaMatcher
 from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
 from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
-from bdikit.schema_matching.topk.algorithms.contrastivelearning import (
-    CLTopkSchemaMatcher,
-)
+from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher
 
 
 class TwoPhaseSchemaMatcher(BaseSchemaMatcher):

diff --git a/...ema_matching/best/algorithms/valentine.py → bdikit/schema_matching/best/valentine.py b/...ema_matching/best/algorithms/valentine.py → bdikit/schema_matching/best/valentine.py
diff --git a/bdikit/schema_matching/topk/__init__.py b/bdikit/schema_matching/topk/__init__.py
@@ -1 +1 @@
-from bdikit.schema_matching.topk.algorithms.contrastivelearning import *
+
diff --git a/...ng/topk/algorithms/contrastivelearning.py → ...hema_matching/topk/contrastivelearning.py b/...ng/topk/algorithms/contrastivelearning.py → ...hema_matching/topk/contrastivelearning.py
diff --git a/bdikit/schema_matching/topk/matcher_factory.py b/bdikit/schema_matching/topk/matcher_factory.py
@@ -1,26 +1,34 @@
+import importlib
 from enum import Enum
-from typing import Mapping, Any, Type
+from typing import Mapping, Any
 from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
-from bdikit.schema_matching.topk import CLTopkSchemaMatcher
 
 
 class TopkMatchers(Enum):
-    CT_LEARNING = ("ct_learning", CLTopkSchemaMatcher)
+    CT_LEARNING = (
+        "ct_learning",
+        "bdikit.schema_matching.topk.contrastivelearning.CLTopkSchemaMatcher",
+    )
 
-    def __init__(self, method_name: str, method_class: Type[BaseTopkSchemaMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
+    def __init__(self, matcher_name: str, matcher_path: str):
+        self.matcher_name = matcher_name
+        self.matcher_path = matcher_path
 
     @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
+    def get_matcher(
+        matcher_name: str, **matcher_kwargs: Mapping[str, Any]
     ) -> BaseTopkSchemaMatcher:
-        methods = {method.method_name: method.method_class for method in TopkMatchers}
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
+        if matcher_name not in matchers:
+            names = ", ".join(list(matchers.keys()))
             raise ValueError(
-                f"The {method_name} algorithm is not supported. "
+                f"The {matcher_name} algorithm is not supported. "
                 f"Supported algorithms are: {names}"
             )
+        # Load the class dynamically
+        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
+        module = importlib.import_module(module_path)
+
+        return getattr(module, class_name)(**matcher_kwargs)
+
+
+matchers = {method.matcher_name: method.matcher_path for method in TopkMatchers}
diff --git a/bdikit/value_matching/__init__.py b/bdikit/value_matching/__init__.py
@@ -1,2 +0,0 @@
-from bdikit.value_matching.algorithms.polyfuzz import *
-from bdikit.value_matching.algorithms.gpt import *

diff --git a/bdikit/value_matching/algorithms/__init__.py b/bdikit/value_matching/algorithms/__init__.py
diff --git a/bdikit/value_matching/algorithms/gpt.py → bdikit/value_matching/gpt.py b/bdikit/value_matching/algorithms/gpt.py → bdikit/value_matching/gpt.py
diff --git a/bdikit/value_matching/matcher_factory.py b/bdikit/value_matching/matcher_factory.py
@@ -1,36 +1,44 @@
+import importlib
 from enum import Enum
-from typing import Mapping, Any, Type
+from typing import Mapping, Any
 from bdikit.value_matching.base import BaseValueMatcher
-from bdikit.value_matching import (
-    GPTValueMatcher,
-    TFIDFValueMatcher,
-    EditDistanceValueMatcher,
-    EmbeddingValueMatcher,
-    FastTextValueMatcher,
-)
 
 
 class ValueMatchers(Enum):
-    TFIDF = ("tfidf", TFIDFValueMatcher)
-    EDIT = ("edit_distance", EditDistanceValueMatcher)
-    EMBEDDINGS = ("embedding", EmbeddingValueMatcher)
-    FASTTEXT = ("fasttext", FastTextValueMatcher)
-    GPT = ("gpt", GPTValueMatcher)
+    TFIDF = ("tfidf", "bdikit.value_matching.polyfuzz.TFIDFValueMatcher")
+    EDIT = (
+        "edit_distance",
+        "bdikit.value_matching.polyfuzz.EditDistanceValueMatcher",
+    )
+    EMBEDDINGS = (
+        "embedding",
+        "bdikit.value_matching.polyfuzz.EmbeddingValueMatcher",
+    )
+    FASTTEXT = (
+        "fasttext",
+        "bdikit.value_matching.polyfuzz.FastTextValueMatcher",
+    )
+    GPT = ("gpt", "bdikit.value_matching.gpt.GPTValueMatcher")
 
-    def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
+    def __init__(self, matcher_name: str, matcher_path: str):
+        self.matcher_name = matcher_name
+        self.matcher_path = matcher_path
 
     @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
+    def get_matcher(
+        matcher_name: str, **matcher_kwargs: Mapping[str, Any]
     ) -> BaseValueMatcher:
-        methods = {method.method_name: method.method_class for method in ValueMatchers}
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
+        if matcher_name not in matchers:
+            names = ", ".join(list(matchers.keys()))
             raise ValueError(
-                f"The {method_name} algorithm is not supported. "
+                f"The {matcher_name} algorithm is not supported. "
                 f"Supported algorithms are: {names}"
             )
+        # Load the class dynamically
+        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
+        module = importlib.import_module(module_path)
+
+        return getattr(module, class_name)(**matcher_kwargs)
+
+
+matchers = {method.matcher_name: method.matcher_path for method in ValueMatchers}
diff --git a/bdikit/value_matching/algorithms/polyfuzz.py → bdikit/value_matching/polyfuzz.py b/bdikit/value_matching/algorithms/polyfuzz.py → bdikit/value_matching/polyfuzz.py
diff --git a/tests/test_schema_matching.py b/tests/test_schema_matching.py
@@ -1,13 +1,13 @@
 import pandas as pd
-from bdikit.schema_matching.best import (
+from bdikit.schema_matching.best.valentine import (
     SimFloodSchemaMatcher,
     JaccardSchemaMatcher,
     DistributionBasedSchemaMatcher,
     ComaSchemaMatcher,
     CupidSchemaMatcher,
-    TwoPhaseSchemaMatcher,
-    ContrastiveLearningSchemaMatcher,
 )
+from bdikit.schema_matching.best.twophase import TwoPhaseSchemaMatcher
+from bdikit.schema_matching.best.contrastivelearning import ContrastiveLearningSchemaMatcher
 
 
 def test_basic_column_mapping_algorithms():

diff --git a/tests/test_value_matching.py b/tests/test_value_matching.py
@@ -1,6 +1,6 @@
 import unittest
 import pandas as pd
-from bdikit.value_matching import (
+from bdikit.value_matching.polyfuzz import (
     TFIDFValueMatcher,
     EditDistanceValueMatcher,
 )