feat: Restructure packages to streamline the addition of new algorithms

VIDA-NYU · Oct 21, 2024 · c82c887 · c82c887
1 parent b6386b6
commit c82c887
Show file tree

Hide file tree

Showing 31 changed files with 753 additions and 736 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,26 @@
+Contributing to bdi-kit
+=======================
+
+There are many ways to contribute to bdi-kit, such as improving the codebase, reporting 
+issues or bugs, enhancing the documentation, reviewing pull requests from other developers, 
+adding new matching methods, or expanding support for additional standards. 
+See the instructions below to get started!
+
+
+Adding New Matching Methods
+---------------------------
+
+Contributors can add new methods for schema and value matching by following these steps:
+
+1. Create a Python module inside the `algorithms` folder (e.g., `bdikit/value_matching/algorithms`).
+
+2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).
+
+3. Instantiate an object of your class in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Ensure your module is properly imported in the `__init__.py` file (e.g.,` bdikit/value_matching/__init__.py`).
+
+
+Code of Conduct
+---------------
+
+We abide by the principles of openness, respect, and consideration of others
+of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
diff --git a/bdikit/api.py b/bdikit/api.py
@@ -1,19 +1,16 @@
 from __future__ import annotations
 import logging
-from enum import Enum
+
 from collections import defaultdict
 from os.path import join, dirname
 from typing import (
     Union,
-    Type,
     List,
     Dict,
     TypedDict,
-    Set,
     Optional,
     Tuple,
     Callable,
-    Mapping,
     Any,
 )
 import itertools
@@ -22,37 +19,15 @@
 import panel as pn
 from IPython.display import display, Markdown
 from bdikit.utils import get_gdc_data, get_gdc_metadata
-from bdikit.mapping_algorithms.column_mapping.algorithms import (
-    BaseSchemaMatcher,
-    SimFloodSchemaMatcher,
-    ComaSchemaMatcher,
-    CupidSchemaMatcher,
-    DistributionBasedSchemaMatcher,
-    JaccardSchemaMatcher,
-    GPTSchemaMatcher,
-    ContrastiveLearningSchemaMatcher,
-    TwoPhaseSchemaMatcher,
-    MaxValSimSchemaMatcher,
-)
-from bdikit.mapping_algorithms.value_mapping.value_mappers import ValueMapper
-from bdikit.models.contrastive_learning.cl_api import (
-    DEFAULT_CL_MODEL,
-)
-from bdikit.mapping_algorithms.column_mapping.topk_matchers import (
-    TopkColumnMatcher,
-    CLTopkColumnMatcher,
-)
-from bdikit.mapping_algorithms.value_mapping.algorithms import (
-    ValueMatch,
-    BaseValueMatcher,
-    TFIDFValueMatcher,
-    GPTValueMatcher,
-    EditDistanceValueMatcher,
-    EmbeddingValueMatcher,
-    AutoFuzzyJoinValueMatcher,
-    FastTextValueMatcher,
-)
-from bdikit.mapping_algorithms.value_mapping.value_mappers import (
+
+from bdikit.schema_matching.best.base import BaseSchemaMatcher
+from bdikit.schema_matching.best.matcher_factory import SchemaMatchers
+from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
+from bdikit.schema_matching.topk.matcher_factory import TopkMatchers
+from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
+from bdikit.value_matching.matcher_factory import ValueMatchers
+
+from bdikit.mapping_functions import (
     ValueMapper,
     FunctionValueMapper,
     DictionaryMapper,
@@ -67,37 +42,6 @@
 logger = logging.getLogger(__name__)
 
 
-class SchemaMatchers(Enum):
-    SIMFLOOD = ("similarity_flooding", SimFloodSchemaMatcher)
-    COMA = ("coma", ComaSchemaMatcher)
-    CUPID = ("cupid", CupidSchemaMatcher)
-    DISTRIBUTION_BASED = ("distribution_based", DistributionBasedSchemaMatcher)
-    JACCARD_DISTANCE = ("jaccard_distance", JaccardSchemaMatcher)
-    GPT = ("gpt", GPTSchemaMatcher)
-    CT_LEARGNING = ("ct_learning", ContrastiveLearningSchemaMatcher)
-    TWO_PHASE = ("two_phase", TwoPhaseSchemaMatcher)
-    MAX_VAL_SIM = ("max_val_sim", MaxValSimSchemaMatcher)
-
-    def __init__(self, method_name: str, method_class: Type[BaseSchemaMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
-
-    @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
-    ) -> BaseSchemaMatcher:
-        methods = {method.method_name: method.method_class for method in SchemaMatchers}
-
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
-            raise ValueError(
-                f"The {method_name} algorithm is not supported. "
-                f"Supported algorithms are: {names}"
-            )
-
-
 def match_schema(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame] = "gdc",
@@ -154,34 +98,12 @@ def _load_table_for_standard(name: str) -> pd.DataFrame:
         raise ValueError(f"The {name} standard is not supported")
 
 
-class TopkMatchers(Enum):
-    CT_LEARNING = ("ct_learning", CLTopkColumnMatcher)
-
-    def __init__(self, method_name: str, method_class: Type[TopkColumnMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
-
-    @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
-    ) -> TopkColumnMatcher:
-        methods = {method.method_name: method.method_class for method in TopkMatchers}
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
-            raise ValueError(
-                f"The {method_name} algorithm is not supported. "
-                f"Supported algorithms are: {names}"
-            )
-
-
 def top_matches(
     source: pd.DataFrame,
     columns: Optional[List[str]] = None,
     target: Union[str, pd.DataFrame] = "gdc",
     top_k: int = 10,
-    method: Union[str, TopkColumnMatcher] = "ct_learning",
+    method: Union[str, BaseTopkSchemaMatcher] = "ct_learning",
     method_args: Optional[Dict[str, Any]] = None,
 ) -> pd.DataFrame:
     """
@@ -211,11 +133,11 @@ def top_matches(
         if method_args is None:
             method_args = {}
         topk_matcher = TopkMatchers.get_instance(method, **method_args)
-    elif isinstance(method, TopkColumnMatcher):
+    elif isinstance(method, BaseTopkSchemaMatcher):
         topk_matcher = method
     else:
         raise ValueError(
-            "The method must be a string or an instance of TopkColumnMatcher"
+            "The method must be a string or an instance of BaseTopkColumnMatcher"
         )
 
     top_k_matches = topk_matcher.get_recommendations(
@@ -232,47 +154,11 @@ def top_matches(
     return pd.concat(dfs, ignore_index=True)
 
 
-class ValueMatchers(Enum):
-    TFIDF = ("tfidf", TFIDFValueMatcher)
-    EDIT = ("edit_distance", EditDistanceValueMatcher)
-    EMBEDDINGS = ("embedding", EmbeddingValueMatcher)
-    AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinValueMatcher)
-    FASTTEXT = ("fasttext", FastTextValueMatcher)
-    GPT = ("gpt", GPTValueMatcher)
-
-    def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
-
-    @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
-    ) -> BaseValueMatcher:
-        methods = {method.method_name: method.method_class for method in ValueMatchers}
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
-            raise ValueError(
-                f"The {method_name} algorithm is not supported. "
-                f"Supported algorithms are: {names}"
-            )
-
-
-class ValueMatchingResult(TypedDict):
-    source: str
-    target: str
-    matches: List[ValueMatch]
-    coverage: float
-    unique_values: Set[str]
-    unmatch_values: Set[str]
-
-
 def match_values(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame],
     column_mapping: Union[Tuple[str, str], pd.DataFrame],
-    method: str = DEFAULT_VALUE_MATCHING_METHOD,
+    method: Union[str, BaseValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
     method_args: Optional[Dict[str, Any]] = None,
 ) -> Union[pd.DataFrame, List[pd.DataFrame]]:
     """