feat: Restructure packages to add new algorithms easier #87

Open · wants to merge 5 commits into base: devel

50 changes: 50 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,50 @@
Contributing to bdi-kit
=======================

There are many ways to contribute to bdi-kit, such as improving the codebase, reporting
issues or bugs, enhancing the documentation, reviewing pull requests from other developers,
adding new matching methods, or expanding support for additional standards.
See the instructions below to get started!


Formatting the Code
-------------------

We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
The CI runs on every pull request and will fail if the code is not properly formatted.
To make sure your formatting is correct, follow these steps.

Make sure you have black installed:
```
pip install black
```

To format the code, run the following command before committing your changes:
```
make format
```

Or you can use the black command directly:
```
black ./bdikit/
```


Adding New Matching Methods
---------------------------

Contributors can add new methods for schema and value matching by following these steps:

1. Create a Python module inside the corresponding task folder (e.g., `bdikit/value_matching` for value matching).

2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).

3. Add a new entry in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Make sure to use the correct import path for your
module so that it can be accessed without errors. A minimal sketch of steps 2 and 3 is shown below.
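
For illustration, here is a minimal sketch of steps 2 and 3 for a new value matcher. The exact `BaseValueMatcher` interface and `ValueMatch` fields are not shown in this PR, so the `match` method name, its signature, the `ValueMatch` constructor arguments, and the module/entry names (`exact.py`, `"exact"`) are assumptions made only for this example:

```
# Hypothetical module: bdikit/value_matching/exact.py
from typing import List

from bdikit.value_matching.base import BaseValueMatcher, ValueMatch


class ExactValueMatcher(BaseValueMatcher):
    """Toy matcher that links values that are identical ignoring case."""

    # The method name and signature below are assumptions for illustration.
    def match(
        self, source_values: List[str], target_values: List[str]
    ) -> List[ValueMatch]:
        targets = {value.lower(): value for value in target_values}
        matches: List[ValueMatch] = []
        for source_value in source_values:
            target_value = targets.get(source_value.lower())
            if target_value is not None:
                # Assumed ValueMatch fields: source value, target value, similarity.
                matches.append(ValueMatch(source_value, target_value, 1.0))
        return matches
```

The corresponding entry in `bdikit/value_matching/matcher_factory.py` (step 3) would then map a method name to this class, mirroring the enum entries that this PR removes from `api.py`:

```
# Inside the ValueMatchers enum; the entry name is illustrative.
EXACT = ("exact", ExactValueMatcher)
```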


Code of Conduct
---------------

We abide by the principles of openness, respect, and consideration of others
of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
20 changes: 2 additions & 18 deletions README.md
@@ -14,6 +14,7 @@ The `bdi-kit` is a library that assists users in performing data harmonization.

**Warning:** `bdi-kit` is currently in *alpha* stage and under heavy development. Expect APIs to change.


## Documentation

Documentation is available at [https://bdi-kit.readthedocs.io/](https://bdi-kit.readthedocs.io/).
@@ -36,21 +37,4 @@ pip install git+https://github.com/VIDA-NYU/bdi-kit@devel

## Contributing

We format code using the [black](https://black.readthedocs.io/en/stable/) code formatter.
The CI runs for every pull request and will fail if code is not properly formatted.
To make sure formatting is correct, you can do the following steps.

Make sure you have black installed:
```
pip install black
```

To format the code, anyone can use the command before committing your changes:
```
make format
```

Or you can use the black command directly:
```
black ./bdikit/
```
To learn more about making a contribution to bdi-kit, please see our [Contributing guide](./CONTRIBUTING.md).
154 changes: 20 additions & 134 deletions bdikit/api.py
@@ -1,19 +1,16 @@
from __future__ import annotations
import logging
from enum import Enum

from collections import defaultdict
from os.path import join, dirname
from typing import (
Union,
Type,
List,
Dict,
TypedDict,
Set,
Optional,
Tuple,
Callable,
Mapping,
Any,
)
import itertools
@@ -22,37 +19,15 @@
import panel as pn
from IPython.display import display, Markdown
from bdikit.utils import get_gdc_data, get_gdc_metadata
from bdikit.mapping_algorithms.column_mapping.algorithms import (
BaseSchemaMatcher,
SimFloodSchemaMatcher,
ComaSchemaMatcher,
CupidSchemaMatcher,
DistributionBasedSchemaMatcher,
JaccardSchemaMatcher,
GPTSchemaMatcher,
ContrastiveLearningSchemaMatcher,
TwoPhaseSchemaMatcher,
MaxValSimSchemaMatcher,
)
from bdikit.mapping_algorithms.value_mapping.value_mappers import ValueMapper
from bdikit.models.contrastive_learning.cl_api import (
DEFAULT_CL_MODEL,
)
from bdikit.mapping_algorithms.column_mapping.topk_matchers import (
TopkColumnMatcher,
CLTopkColumnMatcher,
)
from bdikit.mapping_algorithms.value_mapping.algorithms import (
ValueMatch,
BaseValueMatcher,
TFIDFValueMatcher,
GPTValueMatcher,
EditDistanceValueMatcher,
EmbeddingValueMatcher,
AutoFuzzyJoinValueMatcher,
FastTextValueMatcher,
)
from bdikit.mapping_algorithms.value_mapping.value_mappers import (

from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
from bdikit.schema_matching.one2one.matcher_factory import SchemaMatchers
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.matcher_factory import TopkMatchers
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
from bdikit.value_matching.matcher_factory import ValueMatchers

from bdikit.mapping_functions import (
ValueMapper,
FunctionValueMapper,
DictionaryMapper,
@@ -67,37 +42,6 @@
logger = logging.getLogger(__name__)


class SchemaMatchers(Enum):
SIMFLOOD = ("similarity_flooding", SimFloodSchemaMatcher)
COMA = ("coma", ComaSchemaMatcher)
CUPID = ("cupid", CupidSchemaMatcher)
DISTRIBUTION_BASED = ("distribution_based", DistributionBasedSchemaMatcher)
JACCARD_DISTANCE = ("jaccard_distance", JaccardSchemaMatcher)
GPT = ("gpt", GPTSchemaMatcher)
CT_LEARGNING = ("ct_learning", ContrastiveLearningSchemaMatcher)
TWO_PHASE = ("two_phase", TwoPhaseSchemaMatcher)
MAX_VAL_SIM = ("max_val_sim", MaxValSimSchemaMatcher)

def __init__(self, method_name: str, method_class: Type[BaseSchemaMatcher]):
self.method_name = method_name
self.method_class = method_class

@staticmethod
def get_instance(
method_name: str, **method_kwargs: Mapping[str, Any]
) -> BaseSchemaMatcher:
methods = {method.method_name: method.method_class for method in SchemaMatchers}

try:
return methods[method_name](**method_kwargs)
except KeyError:
names = ", ".join(list(methods.keys()))
raise ValueError(
f"The {method_name} algorithm is not supported. "
f"Supported algorithms are: {names}"
)


def match_schema(
source: pd.DataFrame,
target: Union[str, pd.DataFrame] = "gdc",
@@ -130,7 +74,7 @@ def match_schema(
if isinstance(method, str):
if method_args is None:
method_args = {}
matcher_instance = SchemaMatchers.get_instance(method, **method_args)
matcher_instance = SchemaMatchers.get_matcher(method, **method_args)
elif isinstance(method, BaseSchemaMatcher):
matcher_instance = method
else:
@@ -154,34 +98,12 @@ def _load_table_for_standard(name: str) -> pd.DataFrame:
raise ValueError(f"The {name} standard is not supported")


class TopkMatchers(Enum):
CT_LEARNING = ("ct_learning", CLTopkColumnMatcher)

def __init__(self, method_name: str, method_class: Type[TopkColumnMatcher]):
self.method_name = method_name
self.method_class = method_class

@staticmethod
def get_instance(
method_name: str, **method_kwargs: Mapping[str, Any]
) -> TopkColumnMatcher:
methods = {method.method_name: method.method_class for method in TopkMatchers}
try:
return methods[method_name](**method_kwargs)
except KeyError:
names = ", ".join(list(methods.keys()))
raise ValueError(
f"The {method_name} algorithm is not supported. "
f"Supported algorithms are: {names}"
)


def top_matches(
source: pd.DataFrame,
columns: Optional[List[str]] = None,
target: Union[str, pd.DataFrame] = "gdc",
top_k: int = 10,
method: Union[str, TopkColumnMatcher] = "ct_learning",
method: Union[str, BaseTopkSchemaMatcher] = "ct_learning",
method_args: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
"""
@@ -210,12 +132,12 @@ def top_matches(
if isinstance(method, str):
if method_args is None:
method_args = {}
topk_matcher = TopkMatchers.get_instance(method, **method_args)
elif isinstance(method, TopkColumnMatcher):
topk_matcher = TopkMatchers.get_matcher(method, **method_args)
elif isinstance(method, BaseTopkSchemaMatcher):
topk_matcher = method
else:
raise ValueError(
"The method must be a string or an instance of TopkColumnMatcher"
"The method must be a string or an instance of BaseTopkColumnMatcher"
)

top_k_matches = topk_matcher.get_recommendations(
@@ -232,47 +154,11 @@
return pd.concat(dfs, ignore_index=True)


class ValueMatchers(Enum):
TFIDF = ("tfidf", TFIDFValueMatcher)
EDIT = ("edit_distance", EditDistanceValueMatcher)
EMBEDDINGS = ("embedding", EmbeddingValueMatcher)
AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinValueMatcher)
FASTTEXT = ("fasttext", FastTextValueMatcher)
GPT = ("gpt", GPTValueMatcher)

def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]):
self.method_name = method_name
self.method_class = method_class

@staticmethod
def get_instance(
method_name: str, **method_kwargs: Mapping[str, Any]
) -> BaseValueMatcher:
methods = {method.method_name: method.method_class for method in ValueMatchers}
try:
return methods[method_name](**method_kwargs)
except KeyError:
names = ", ".join(list(methods.keys()))
raise ValueError(
f"The {method_name} algorithm is not supported. "
f"Supported algorithms are: {names}"
)


class ValueMatchingResult(TypedDict):
source: str
target: str
matches: List[ValueMatch]
coverage: float
unique_values: Set[str]
unmatch_values: Set[str]


def match_values(
source: pd.DataFrame,
target: Union[str, pd.DataFrame],
column_mapping: Union[Tuple[str, str], pd.DataFrame],
method: str = DEFAULT_VALUE_MATCHING_METHOD,
method: Union[str, BaseValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
method_args: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
"""
@@ -314,11 +200,11 @@ def match_values(
if method_args is None:
method_args = {}

if "top_n" in method_args and method_args["top_n"] > 1:
if "top_k" in method_args and method_args["top_k"] > 1:
logger.warning(
f"Ignoring 'top_n' argument, use the 'top_value_matches()' method to get top-k value matches."
f"Ignoring 'top_k' argument, use the 'top_value_matches()' method to get top-k value matches."
)
method_args["top_n"] = 1
method_args["top_k"] = 1

matches = _match_values(source, target, column_mapping, method, method_args)

@@ -457,7 +343,7 @@ def _match_values(
target_domain, column_mapping_list = _format_value_matching_input(
source, target, column_mapping
)
value_matcher = ValueMatchers.get_instance(method, **method_args)
value_matcher = ValueMatchers.get_matcher(method, **method_args)
mapping_results: List[ValueMatchingResult] = []

for mapping in column_mapping_list:
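
Taken together, the restructuring keeps the public API functions in `bdikit/api.py` but resolves matchers through the per-task `matcher_factory` modules. As a hedged usage sketch (the import path follows the diff above, the sample data and column names are made up, and the method names `"ct_learning"` and `"tfidf"` are assumed to remain registered after the refactoring):

```
import pandas as pd

# Assumes the public functions live in bdikit/api.py as this diff shows;
# they may also be re-exported at the package level.
from bdikit.api import match_schema, match_values

# Made-up source data for illustration.
source = pd.DataFrame({"Gender": ["M", "F", "F"]})

# The method name is resolved via SchemaMatchers.get_matcher().
column_matches = match_schema(source, target="gdc", method="ct_learning")

# Value matching resolves "tfidf" via ValueMatchers.get_matcher();
# the column pair below is hypothetical.
value_matches = match_values(
    source,
    target="gdc",
    column_mapping=("Gender", "gender"),
    method="tfidf",
)
```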