Skip to content

Commit

Permalink
refactor: Rename package 'best' to 'one2one'
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez committed Nov 12, 2024
1 parent 9b66320 commit 43a5a8b
Show file tree
Hide file tree
Showing 14 changed files with 32 additions and 32 deletions.
4 changes: 2 additions & 2 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
from IPython.display import display, Markdown
from bdikit.utils import get_gdc_data, get_gdc_metadata

from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best.matcher_factory import SchemaMatchers
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
from bdikit.schema_matching.one2one.matcher_factory import SchemaMatchers
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.matcher_factory import TopkMatchers
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
from openai import OpenAI
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher


class GPTSchemaMatcher(BaseSchemaMatcher):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,42 +1,42 @@
import importlib
from enum import Enum
from typing import Mapping, Any
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher


class SchemaMatchers(Enum):
SIMFLOOD = (
"similarity_flooding",
"bdikit.schema_matching.best.valentine.SimFloodSchemaMatcher",
"bdikit.schema_matching.one2one.valentine.SimFloodSchemaMatcher",
)
COMA = (
"coma",
"bdikit.schema_matching.best.valentine.ComaSchemaMatcher",
"bdikit.schema_matching.one2one.valentine.ComaSchemaMatcher",
)
CUPID = (
"cupid",
"bdikit.schema_matching.best.valentine.CupidSchemaMatcher",
"bdikit.schema_matching.one2one.valentine.CupidSchemaMatcher",
)
DISTRIBUTION_BASED = (
"distribution_based",
"bdikit.schema_matching.best.valentine.DistributionBasedSchemaMatcher",
"bdikit.schema_matching.one2one.valentine.DistributionBasedSchemaMatcher",
)
JACCARD_DISTANCE = (
"jaccard_distance",
"bdikit.schema_matching.best.valentine.JaccardDistanceSchemaMatcher",
"bdikit.schema_matching.one2one.valentine.JaccardDistanceSchemaMatcher",
)
GPT = ("gpt", "bdikit.schema_matching.best.gpt.GPTSchemaMatcher")
GPT = ("gpt", "bdikit.schema_matching.one2one.gpt.GPTSchemaMatcher")
CT_LEARNING = (
"ct_learning",
"bdikit.schema_matching.best.contrastivelearning.ContrastiveLearningSchemaMatcher",
"bdikit.schema_matching.one2one.contrastivelearning.ContrastiveLearningSchemaMatcher",
)
TWO_PHASE = (
"two_phase",
"bdikit.schema_matching.best.twophase.TwoPhaseSchemaMatcher",
"bdikit.schema_matching.one2one.twophase.TwoPhaseSchemaMatcher",
)
MAX_VAL_SIM = (
"max_val_sim",
"bdikit.schema_matching.best.maxvalsim.MaxValSimSchemaMatcher",
"bdikit.schema_matching.one2one.maxvalsim.MaxValSimSchemaMatcher",
)

def __init__(self, matcher_name: str, matcher_path: str):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
from typing import Optional
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
from typing import Optional
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best.valentine import SimFloodSchemaMatcher
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
from bdikit.schema_matching.one2one.valentine import SimFloodSchemaMatcher
from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
from typing import Dict, Callable
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.one2one.base import BaseSchemaMatcher
from valentine import valentine_match
from valentine.algorithms.matcher_results import MatcherResults
from valentine.algorithms.jaccard_distance import StringDistanceFunction
Expand Down
2 changes: 1 addition & 1 deletion bdikit/value_matching/polyfuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def match(
target = top_matches[index]
similarity = top_matches[index + 1]
if similarity >= self.threshold:
matches.append((source, target, similarity))
matches.append(ValueMatch(source, target, similarity))

return matches

Expand Down
2 changes: 1 addition & 1 deletion docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@

.. automodule:: bdikit.api
:members:
:exclude-members: SchemaMatchers, ValueMatchers, TopkMatchers, ValueMatchingResult, ColumnMappingSpec
:exclude-members: ColumnMappingSpec
16 changes: 8 additions & 8 deletions docs/source/schema-matching.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ To see how to use these methods, please refer to the documentation of :py:func:`
- Class
- Description
* - ``ct_learning``
- :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.ContrastiveLearningSchemaMatcher`
- :class:`~bdikit.schema_matching.one2one.contrastivelearning.ContrastiveLearningSchemaMatcher`
- | Uses a contrastive (CT) learning model to learn embeddings for columns and retrieves the most similar columns as the best matches, using the cosine similarity between the column embeddings.
* - ``two_phase``
- :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.TwoPhaseSchemaMatcher`
- :class:`~bdikit.schema_matching.one2one.twophase.TwoPhaseSchemaMatcher`
- | The two-phase schema matching method first uses a top-k column matcher (e.g., `ct_learning`) to prune the search space (keeping only the top-k most likely matches), and then uses another column matcher to choose the best match from the pruned search space.
* - ``gpt``
- :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.GPTSchemaMatcher`
- :class:`~bdikit.schema_matching.one2one.gpt.GPTSchemaMatcher`
- | This method uses the `ct_learning` method to prune the search space and then uses a large language model (GPT-4) to choose the best column match, given the set of top-k most likely candidates retrieved by the `ct_learning` method in the first phase.

.. list-table:: Methods from other libraries
Expand All @@ -32,17 +32,17 @@ To see how to use these methods, please refer to the documentation of :py:func:`
- Class
- Description
* - ``similarity_flooding``
- :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.SimFloodSchemaMatcher`
- :class:`~bdikit.schema_matching.one2one.valentine.SimFloodSchemaMatcher`
- | Similarity Flooding transforms schemas into directed graphs and merges them into a propagation graph. The algorithm iteratively propagates similarity scores to neighboring nodes until convergence. This algorithm was proposed by Sergey Melnik, Hector Garcia-Molina, and Erhard Rahm in "Similarity Flooding: A Versatile Graph Matching Algorithm and Its Application to Schema Matching" (ICDE, 2002).
* - ``coma``
- :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.ComaSchemaMatcher`
- :class:`~bdikit.schema_matching.one2one.valentine.ComaSchemaMatcher`
- | COMA is a matcher that combines multiple schema-based matchers, representing schemas as rooted directed acyclic graphs. This algorithm was proposed by Do, Hong-Hai, and Erhard Rahm in "COMA — a system for flexible combination of schema matching approaches." (VLDB 2002). *This algorithm requires Java to be installed on the system.*
* - ``cupid``
- :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.CupidSchemaMatcher`
- :class:`~bdikit.schema_matching.one2one.valentine.CupidSchemaMatcher`
- | Cupid is a schema-based approach that translates schemas into tree structures. It calculates overall similarity using linguistic and structural similarities, with tree transformations helping to compute context-based similarity. This algorithm was proposed by Madhavan et al. in "Generic Schema Matching with Cupid" (VLDB, 2001).
* - ``distribution_based``
- :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.DistributionBasedSchemaMatcher`
- :class:`~bdikit.schema_matching.one2one.valentine.DistributionBasedSchemaMatcher`
- | Distribution-based Matching compares the distribution of data values in columns using the Earth Mover's Distance. It clusters relational attributes based on these comparisons. This algorithm was proposed by Zhang et al. in "Automatic discovery of attributes in relational databases" (SIGMOD 2011).
* - ``jaccard_distance``
- :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.JaccardSchemaMatcher`
- :class:`~bdikit.schema_matching.one2one.valentine.JaccardSchemaMatcher`
- | This algorithm computes pairwise column similarities using Jaccard similarity, treating values as identical if their Levenshtein distance is below a threshold. The algorithm was proposed by Koutras et al. in "Valentine: Evaluating matching techniques for dataset discovery" (ICDE 2021).
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ matplotlib<3.9
panel!=1.4.3
Levenshtein
autofj
nltk>=3.9.1
nltk>=3.9.1
6 changes: 3 additions & 3 deletions tests/test_schema_matching.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import pandas as pd
from bdikit.schema_matching.best.valentine import (
from bdikit.schema_matching.one2one.valentine import (
SimFloodSchemaMatcher,
JaccardSchemaMatcher,
DistributionBasedSchemaMatcher,
ComaSchemaMatcher,
CupidSchemaMatcher,
)
from bdikit.schema_matching.best.twophase import TwoPhaseSchemaMatcher
from bdikit.schema_matching.best.contrastivelearning import ContrastiveLearningSchemaMatcher
from bdikit.schema_matching.one2one.twophase import TwoPhaseSchemaMatcher
from bdikit.schema_matching.one2one.contrastivelearning import ContrastiveLearningSchemaMatcher


def test_basic_column_mapping_algorithms():
Expand Down

0 comments on commit 43a5a8b

Please sign in to comment.