Skip to content

Commit

Permalink
feat: Restructure packages to streamline the addition of new algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez committed Oct 21, 2024
1 parent b6386b6 commit c82c887
Show file tree
Hide file tree
Showing 31 changed files with 753 additions and 736 deletions.
26 changes: 26 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
Contributing to bdi-kit
=======================

There are many ways to contribute to bdi-kit, such as improving the codebase, reporting
issues or bugs, enhancing the documentation, reviewing pull requests from other developers,
adding new matching methods, or expanding support for additional standards.
See the instructions below to get started!


Adding New Matching Methods
---------------------------

Contributors can add new methods for schema and value matching by following these steps:

1. Create a Python module inside the `algorithms` folder (e.g., `bdikit/value_matching/algorithms`).

2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).

3. Instantiate an object of your class in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Ensure your module is properly imported in the `__init__.py` file (e.g., `bdikit/value_matching/__init__.py`).


Code of Conduct
---------------

We abide by the principles of openness, respect, and consideration of others
of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
142 changes: 14 additions & 128 deletions bdikit/api.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,16 @@
from __future__ import annotations
import logging
from enum import Enum

from collections import defaultdict
from os.path import join, dirname
from typing import (
Union,
Type,
List,
Dict,
TypedDict,
Set,
Optional,
Tuple,
Callable,
Mapping,
Any,
)
import itertools
Expand All @@ -22,37 +19,15 @@
import panel as pn
from IPython.display import display, Markdown
from bdikit.utils import get_gdc_data, get_gdc_metadata
from bdikit.mapping_algorithms.column_mapping.algorithms import (
BaseSchemaMatcher,
SimFloodSchemaMatcher,
ComaSchemaMatcher,
CupidSchemaMatcher,
DistributionBasedSchemaMatcher,
JaccardSchemaMatcher,
GPTSchemaMatcher,
ContrastiveLearningSchemaMatcher,
TwoPhaseSchemaMatcher,
MaxValSimSchemaMatcher,
)
from bdikit.mapping_algorithms.value_mapping.value_mappers import ValueMapper
from bdikit.models.contrastive_learning.cl_api import (
DEFAULT_CL_MODEL,
)
from bdikit.mapping_algorithms.column_mapping.topk_matchers import (
TopkColumnMatcher,
CLTopkColumnMatcher,
)
from bdikit.mapping_algorithms.value_mapping.algorithms import (
ValueMatch,
BaseValueMatcher,
TFIDFValueMatcher,
GPTValueMatcher,
EditDistanceValueMatcher,
EmbeddingValueMatcher,
AutoFuzzyJoinValueMatcher,
FastTextValueMatcher,
)
from bdikit.mapping_algorithms.value_mapping.value_mappers import (

from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best.matcher_factory import SchemaMatchers
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.matcher_factory import TopkMatchers
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
from bdikit.value_matching.matcher_factory import ValueMatchers

from bdikit.mapping_functions import (
ValueMapper,
FunctionValueMapper,
DictionaryMapper,
Expand All @@ -67,37 +42,6 @@
logger = logging.getLogger(__name__)


class SchemaMatchers(Enum):
    """Registry of the schema matching algorithms selectable by name.

    Each member's value is a ``(method_name, method_class)`` tuple. The
    tuple is unpacked by ``__init__`` into the ``method_name`` and
    ``method_class`` attributes of the member.
    """

    SIMFLOOD = ("similarity_flooding", SimFloodSchemaMatcher)
    COMA = ("coma", ComaSchemaMatcher)
    CUPID = ("cupid", CupidSchemaMatcher)
    DISTRIBUTION_BASED = ("distribution_based", DistributionBasedSchemaMatcher)
    JACCARD_DISTANCE = ("jaccard_distance", JaccardSchemaMatcher)
    GPT = ("gpt", GPTSchemaMatcher)
    # The misspelled name is kept so existing references keep working.
    CT_LEARGNING = ("ct_learning", ContrastiveLearningSchemaMatcher)
    # Correctly spelled alias: same value, so Enum treats it as an alias of
    # CT_LEARGNING and it does not show up as an extra member on iteration.
    CT_LEARNING = ("ct_learning", ContrastiveLearningSchemaMatcher)
    TWO_PHASE = ("two_phase", TwoPhaseSchemaMatcher)
    MAX_VAL_SIM = ("max_val_sim", MaxValSimSchemaMatcher)

    def __init__(self, method_name: str, method_class: Type[BaseSchemaMatcher]):
        self.method_name = method_name
        self.method_class = method_class

    @staticmethod
    def get_instance(method_name: str, **method_kwargs: Any) -> BaseSchemaMatcher:
        """Instantiate the schema matcher registered under ``method_name``.

        Args:
            method_name: The string identifier of the algorithm
                (e.g. ``"coma"``).
            **method_kwargs: Keyword arguments forwarded unchanged to the
                matcher's constructor.

        Returns:
            A new instance of the matcher class registered for ``method_name``.

        Raises:
            ValueError: If ``method_name`` is not a registered algorithm.
        """
        methods = {method.method_name: method.method_class for method in SchemaMatchers}

        # Only the lookup is guarded: a KeyError raised inside the matcher's
        # constructor must propagate as-is instead of being misreported as an
        # unsupported algorithm.
        try:
            method_class = methods[method_name]
        except KeyError:
            names = ", ".join(methods)
            raise ValueError(
                f"The {method_name} algorithm is not supported. "
                f"Supported algorithms are: {names}"
            ) from None

        return method_class(**method_kwargs)


def match_schema(
source: pd.DataFrame,
target: Union[str, pd.DataFrame] = "gdc",
Expand Down Expand Up @@ -154,34 +98,12 @@ def _load_table_for_standard(name: str) -> pd.DataFrame:
raise ValueError(f"The {name} standard is not supported")


class TopkMatchers(Enum):
    """Registry of the top-k column matching algorithms selectable by name.

    Each member's value is a ``(method_name, method_class)`` tuple. The
    tuple is unpacked by ``__init__`` into the ``method_name`` and
    ``method_class`` attributes of the member.
    """

    CT_LEARNING = ("ct_learning", CLTopkColumnMatcher)

    def __init__(self, method_name: str, method_class: Type[TopkColumnMatcher]):
        self.method_name = method_name
        self.method_class = method_class

    @staticmethod
    def get_instance(method_name: str, **method_kwargs: Any) -> TopkColumnMatcher:
        """Instantiate the top-k matcher registered under ``method_name``.

        Args:
            method_name: The string identifier of the algorithm
                (e.g. ``"ct_learning"``).
            **method_kwargs: Keyword arguments forwarded unchanged to the
                matcher's constructor.

        Returns:
            A new instance of the matcher class registered for ``method_name``.

        Raises:
            ValueError: If ``method_name`` is not a registered algorithm.
        """
        methods = {method.method_name: method.method_class for method in TopkMatchers}

        # Only the lookup is guarded: a KeyError raised inside the matcher's
        # constructor must propagate as-is instead of being misreported as an
        # unsupported algorithm.
        try:
            method_class = methods[method_name]
        except KeyError:
            names = ", ".join(methods)
            raise ValueError(
                f"The {method_name} algorithm is not supported. "
                f"Supported algorithms are: {names}"
            ) from None

        return method_class(**method_kwargs)


def top_matches(
source: pd.DataFrame,
columns: Optional[List[str]] = None,
target: Union[str, pd.DataFrame] = "gdc",
top_k: int = 10,
method: Union[str, TopkColumnMatcher] = "ct_learning",
method: Union[str, BaseTopkSchemaMatcher] = "ct_learning",
method_args: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -211,11 +133,11 @@ def top_matches(
if method_args is None:
method_args = {}
topk_matcher = TopkMatchers.get_instance(method, **method_args)
elif isinstance(method, TopkColumnMatcher):
elif isinstance(method, BaseTopkSchemaMatcher):
topk_matcher = method
else:
raise ValueError(
"The method must be a string or an instance of TopkColumnMatcher"
"The method must be a string or an instance of BaseTopkColumnMatcher"
)

top_k_matches = topk_matcher.get_recommendations(
Expand All @@ -232,47 +154,11 @@ def top_matches(
return pd.concat(dfs, ignore_index=True)


class ValueMatchers(Enum):
    """Registry of the value matching algorithms selectable by name.

    Each member's value is a ``(method_name, method_class)`` tuple. The
    tuple is unpacked by ``__init__`` into the ``method_name`` and
    ``method_class`` attributes of the member.
    """

    TFIDF = ("tfidf", TFIDFValueMatcher)
    EDIT = ("edit_distance", EditDistanceValueMatcher)
    EMBEDDINGS = ("embedding", EmbeddingValueMatcher)
    AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinValueMatcher)
    FASTTEXT = ("fasttext", FastTextValueMatcher)
    GPT = ("gpt", GPTValueMatcher)

    def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]):
        self.method_name = method_name
        self.method_class = method_class

    @staticmethod
    def get_instance(method_name: str, **method_kwargs: Any) -> BaseValueMatcher:
        """Instantiate the value matcher registered under ``method_name``.

        Args:
            method_name: The string identifier of the algorithm
                (e.g. ``"tfidf"``).
            **method_kwargs: Keyword arguments forwarded unchanged to the
                matcher's constructor.

        Returns:
            A new instance of the matcher class registered for ``method_name``.

        Raises:
            ValueError: If ``method_name`` is not a registered algorithm.
        """
        methods = {method.method_name: method.method_class for method in ValueMatchers}

        # Only the lookup is guarded: a KeyError raised inside the matcher's
        # constructor must propagate as-is instead of being misreported as an
        # unsupported algorithm.
        try:
            method_class = methods[method_name]
        except KeyError:
            names = ", ".join(methods)
            raise ValueError(
                f"The {method_name} algorithm is not supported. "
                f"Supported algorithms are: {names}"
            ) from None

        return method_class(**method_kwargs)


class ValueMatchingResult(TypedDict):
    """Typed dictionary describing the outcome of matching one column pair.

    NOTE(review): the field descriptions below are inferred from the field
    names and types only; confirm them against the code that builds this dict.
    """

    # Presumably the name of the source column.
    source: str
    # Presumably the name of the target column.
    target: str
    # The individual value matches found for this column pair.
    matches: List[ValueMatch]
    # Presumably the fraction of source values that were matched; confirm.
    coverage: float
    # Presumably the distinct values considered for matching; confirm.
    unique_values: Set[str]
    # Presumably the values for which no match was found; confirm.
    unmatch_values: Set[str]


def match_values(
source: pd.DataFrame,
target: Union[str, pd.DataFrame],
column_mapping: Union[Tuple[str, str], pd.DataFrame],
method: str = DEFAULT_VALUE_MATCHING_METHOD,
method: Union[str, BaseValueMatcher] = DEFAULT_VALUE_MATCHING_METHOD,
method_args: Optional[Dict[str, Any]] = None,
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
"""
Expand Down
Loading

0 comments on commit c82c887

Please sign in to comment.