From 04a390f9bcec19ba380ae8426564605daceda25d Mon Sep 17 00:00:00 2001 From: Roque Lopez Date: Mon, 11 Nov 2024 16:33:22 -0500 Subject: [PATCH] refactor: Rename package 'best' to 'one2one' --- bdikit/api.py | 4 ++-- .../{best => one2one}/__init__.py | 0 .../schema_matching/{best => one2one}/base.py | 0 .../{best => one2one}/contrastivelearning.py | 2 +- .../schema_matching/{best => one2one}/gpt.py | 2 +- .../{best => one2one}/matcher_factory.py | 20 +++++++++---------- .../{best => one2one}/maxvalsim.py | 2 +- .../{best => one2one}/twophase.py | 4 ++-- .../{best => one2one}/valentine.py | 2 +- docs/source/schema-matching.rst | 16 +++++++-------- requirements.txt | 2 -- tests/test_schema_matching.py | 6 +++--- 12 files changed, 29 insertions(+), 31 deletions(-) rename bdikit/schema_matching/{best => one2one}/__init__.py (100%) rename bdikit/schema_matching/{best => one2one}/base.py (100%) rename bdikit/schema_matching/{best => one2one}/contrastivelearning.py (92%) rename bdikit/schema_matching/{best => one2one}/gpt.py (96%) rename bdikit/schema_matching/{best => one2one}/matcher_factory.py (64%) rename bdikit/schema_matching/{best => one2one}/maxvalsim.py (97%) rename bdikit/schema_matching/{best => one2one}/twophase.py (92%) rename bdikit/schema_matching/{best => one2one}/valentine.py (97%) diff --git a/bdikit/api.py b/bdikit/api.py index 747a4c40..44c99a48 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -20,8 +20,8 @@ from IPython.display import display, Markdown from bdikit.utils import get_gdc_data, get_gdc_metadata -from bdikit.schema_matching.best.base import BaseSchemaMatcher -from bdikit.schema_matching.best.matcher_factory import SchemaMatchers +from bdikit.schema_matching.one2one.base import BaseSchemaMatcher +from bdikit.schema_matching.one2one.matcher_factory import SchemaMatchers from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher from bdikit.schema_matching.topk.matcher_factory import TopkMatchers from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult diff --git a/bdikit/schema_matching/best/__init__.py b/bdikit/schema_matching/one2one/__init__.py similarity index 100% rename from bdikit/schema_matching/best/__init__.py rename to bdikit/schema_matching/one2one/__init__.py diff --git a/bdikit/schema_matching/best/base.py b/bdikit/schema_matching/one2one/base.py similarity index 100% rename from bdikit/schema_matching/best/base.py rename to bdikit/schema_matching/one2one/base.py diff --git a/bdikit/schema_matching/best/contrastivelearning.py b/bdikit/schema_matching/one2one/contrastivelearning.py similarity index 92% rename from bdikit/schema_matching/best/contrastivelearning.py rename to bdikit/schema_matching/one2one/contrastivelearning.py index d0c172ac..57688299 100644 --- a/bdikit/schema_matching/best/contrastivelearning.py +++ b/bdikit/schema_matching/one2one/contrastivelearning.py @@ -1,5 +1,5 @@ import pandas as pd -from bdikit.schema_matching.best.base import BaseSchemaMatcher +from bdikit.schema_matching.one2one.base import BaseSchemaMatcher from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher diff --git a/bdikit/schema_matching/best/gpt.py b/bdikit/schema_matching/one2one/gpt.py similarity index 96% rename from bdikit/schema_matching/best/gpt.py rename to bdikit/schema_matching/one2one/gpt.py index 68d27803..8ed2e07c 100644 --- a/bdikit/schema_matching/best/gpt.py +++ b/bdikit/schema_matching/one2one/gpt.py @@ -1,6 +1,6 @@ import pandas as pd from openai import OpenAI -from bdikit.schema_matching.best.base import BaseSchemaMatcher +from bdikit.schema_matching.one2one.base import BaseSchemaMatcher class GPTSchemaMatcher(BaseSchemaMatcher): diff --git a/bdikit/schema_matching/best/matcher_factory.py b/bdikit/schema_matching/one2one/matcher_factory.py similarity index 64% rename from bdikit/schema_matching/best/matcher_factory.py rename to bdikit/schema_matching/one2one/matcher_factory.py index b292245f..5a52c17a 100644 --- a/bdikit/schema_matching/best/matcher_factory.py +++ b/bdikit/schema_matching/one2one/matcher_factory.py @@ -1,42 +1,42 @@ import importlib from enum import Enum from typing import Mapping, Any -from bdikit.schema_matching.best.base import BaseSchemaMatcher +from bdikit.schema_matching.one2one.base import BaseSchemaMatcher class SchemaMatchers(Enum): SIMFLOOD = ( "similarity_flooding", - "bdikit.schema_matching.best.valentine.SimFloodSchemaMatcher", + "bdikit.schema_matching.one2one.valentine.SimFloodSchemaMatcher", ) COMA = ( "coma", - "bdikit.schema_matching.best.valentine.ComaSchemaMatcher", + "bdikit.schema_matching.one2one.valentine.ComaSchemaMatcher", ) CUPID = ( "cupid", - "bdikit.schema_matching.best.valentine.CupidSchemaMatcher", + "bdikit.schema_matching.one2one.valentine.CupidSchemaMatcher", ) DISTRIBUTION_BASED = ( "distribution_based", - "bdikit.schema_matching.best.valentine.DistributionBasedSchemaMatcher", + "bdikit.schema_matching.one2one.valentine.DistributionBasedSchemaMatcher", ) JACCARD_DISTANCE = ( "jaccard_distance", - "bdikit.schema_matching.best.valentine.JaccardDistanceSchemaMatcher", + "bdikit.schema_matching.one2one.valentine.JaccardDistanceSchemaMatcher", ) - GPT = ("gpt", "bdikit.schema_matching.best.gpt.GPTSchemaMatcher") + GPT = ("gpt", "bdikit.schema_matching.one2one.gpt.GPTSchemaMatcher") CT_LEARNING = ( "ct_learning", - "bdikit.schema_matching.best.contrastivelearning.ContrastiveLearningSchemaMatcher", + "bdikit.schema_matching.one2one.contrastivelearning.ContrastiveLearningSchemaMatcher", ) TWO_PHASE = ( "two_phase", - "bdikit.schema_matching.best.twophase.TwoPhaseSchemaMatcher", + "bdikit.schema_matching.one2one.twophase.TwoPhaseSchemaMatcher", ) MAX_VAL_SIM = ( "max_val_sim", - "bdikit.schema_matching.best.maxvalsim.MaxValSimSchemaMatcher", + "bdikit.schema_matching.one2one.maxvalsim.MaxValSimSchemaMatcher", ) def __init__(self, matcher_name: str, matcher_path: str): diff --git a/bdikit/schema_matching/best/maxvalsim.py b/bdikit/schema_matching/one2one/maxvalsim.py similarity index 97% rename from bdikit/schema_matching/best/maxvalsim.py rename to bdikit/schema_matching/one2one/maxvalsim.py index 1fc2dea8..87c82ad0 100644 --- a/bdikit/schema_matching/best/maxvalsim.py +++ b/bdikit/schema_matching/one2one/maxvalsim.py @@ -1,6 +1,6 @@ import pandas as pd from typing import Optional -from bdikit.schema_matching.best.base import BaseSchemaMatcher +from bdikit.schema_matching.one2one.base import BaseSchemaMatcher from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher diff --git a/bdikit/schema_matching/best/twophase.py b/bdikit/schema_matching/one2one/twophase.py similarity index 92% rename from bdikit/schema_matching/best/twophase.py rename to bdikit/schema_matching/one2one/twophase.py index 815d4f3e..7ceae44e 100644 --- a/bdikit/schema_matching/best/twophase.py +++ b/bdikit/schema_matching/one2one/twophase.py @@ -1,7 +1,7 @@ import pandas as pd from typing import Optional -from bdikit.schema_matching.best.base import BaseSchemaMatcher -from bdikit.schema_matching.best.valentine import SimFloodSchemaMatcher +from bdikit.schema_matching.one2one.base import BaseSchemaMatcher +from bdikit.schema_matching.one2one.valentine import SimFloodSchemaMatcher from bdikit.models.contrastive_learning.cl_api import DEFAULT_CL_MODEL from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher from bdikit.schema_matching.topk.contrastivelearning import CLTopkSchemaMatcher diff --git a/bdikit/schema_matching/best/valentine.py b/bdikit/schema_matching/one2one/valentine.py similarity index 97% rename from bdikit/schema_matching/best/valentine.py rename to bdikit/schema_matching/one2one/valentine.py index 67f8fc25..82c22021 100644 --- a/bdikit/schema_matching/best/valentine.py +++ b/bdikit/schema_matching/one2one/valentine.py @@ -1,6 +1,6 @@ import pandas as pd from typing import Dict, Callable -from bdikit.schema_matching.best.base import BaseSchemaMatcher +from bdikit.schema_matching.one2one.base import BaseSchemaMatcher from valentine import valentine_match from valentine.algorithms.matcher_results import MatcherResults from valentine.algorithms.jaccard_distance import StringDistanceFunction diff --git a/docs/source/schema-matching.rst b/docs/source/schema-matching.rst index c09b9a93..71482522 100644 --- a/docs/source/schema-matching.rst +++ b/docs/source/schema-matching.rst @@ -16,13 +16,13 @@ To see how to use these methods, please refer to the documentation of :py:func:` - Class - Description * - ``ct_learning`` - - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.ContrastiveLearningSchemaMatcher` + - :class:`~bdikit.schema_matching.one2one.contrastivelearning.ContrastiveLearningSchemaMatcher` - | Uses a contrastive (CT) learning model to learn embeddings for columns and retrieves the best match most similar columns using the cosine similarity between the column embeddings. * - ``two_phase`` - - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.TwoPhaseSchemaMatcher` + - :class:`~bdikit.schema_matching.one2one.twophase.TwoPhaseSchemaMatcher` - | The two-phase schema matching method first uses a a top-k column matcher (e.g., `ct_learning`) to prune the search space (keeping only the top-k most likely matches), and then uses another column matcher to choose the best match from the pruned search space. * - ``gpt`` - - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.GPTSchemaMatcher` + - :class:`~bdikit.schema_matching.one2one.gpt.GPTSchemaMatcher` - | This method uses the `ct_learning` to prune the search space and then uses a large language model (GPT4) to choose the best column match, given a set of top-k most likely candidates retrieved using the `ct_learning` method in the first phase. .. list-table:: Methods from other libraries @@ -32,17 +32,17 @@ To see how to use these methods, please refer to the documentation of :py:func:` - Class - Description * - ``similarity_flooding`` - - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.SimFloodSchemaMatcher` + - :class:`~bdikit.schema_matching.one2one.valentine.SimFloodSchemaMatcher` - | Similarity Flooding transforms schemas into directed graphs and merges them into a propagation graph. The algorithm iteratively propagates similarity scores to neighboring nodes until convergence. This algorithm was proposed by Sergey Melnik, Hector Garcia-Molina, and Erhard Rahm in "Similarity Flooding: A Versatile Graph Matching Algorithm and Its Application to Schema Matching" (ICDE, 2002). * - ``coma`` - - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.ComaSchemaMatcher` + - :class:`~bdikit.schema_matching.one2one.valentine.ComaSchemaMatcher` - | COMA is a matcher that combines multiple schema-based matchers, representing schemas as rooted directed acyclic graphs. This algorithm was proposed by Do, Hong-Hai, and Erhard Rahm in "COMA — a system for flexible combination of schema matching approaches." (VLDB 2002). *This algorithm requires Java to be installed on the system.* * - ``cupid`` - - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.CupidSchemaMatcher` + - :class:`~bdikit.schema_matching.one2one.valentine.CupidSchemaMatcher` - | Cupid is a schema-based approach that translates schemas into tree structures. It calculates overall similarity using linguistic and structural similarities, with tree transformations helping to compute context-based similarity. This algorithm was proposed by Madhavan et al. in "Generic Schema Matching with Cupid" (VLDB, 2001)​. * - ``distribution_based`` - - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.DistributionBasedSchemaMatcher` + - :class:`~bdikit.schema_matching.one2one.valentine.DistributionBasedSchemaMatcher` - | Distribution-based Matching compares the distribution of data values in columns using the Earth Mover's Distance. It clusters relational attributes based on these comparisons. This algorithm was proposed by Zhang et al. in "Automatic discovery of attributes in relational databases" (SIGMOD 2011). * - ``jaccard_distance`` - - :class:`~bdikit.mapping_algorithms.column_mapping.algorithms.JaccardSchemaMatcher` + - :class:`~bdikit.schema_matching.one2one.valentine.JaccardSchemaMatcher` - | This algorithm computes pairwise column similarities using Jaccard similarity, treating values as identical if their Levenshtein distance is below a threshold. The algorithm was proposed by Koutras et al. in "Valentine: Evaluating matching techniques for dataset discovery" (ICDE 2021). diff --git a/requirements.txt b/requirements.txt index 30524b31..bd314527 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,5 @@ requests scipy<1.13 matplotlib<3.9 panel!=1.4.3 -Levenshtein -autofj conllu<5.0.0 nltk<3.8.2 diff --git a/tests/test_schema_matching.py b/tests/test_schema_matching.py index 698beeb1..e2ce0e07 100644 --- a/tests/test_schema_matching.py +++ b/tests/test_schema_matching.py @@ -1,13 +1,13 @@ import pandas as pd -from bdikit.schema_matching.best.valentine import ( +from bdikit.schema_matching.one2one.valentine import ( SimFloodSchemaMatcher, JaccardSchemaMatcher, DistributionBasedSchemaMatcher, ComaSchemaMatcher, CupidSchemaMatcher, ) -from bdikit.schema_matching.best.twophase import TwoPhaseSchemaMatcher -from bdikit.schema_matching.best.contrastivelearning import ContrastiveLearningSchemaMatcher +from bdikit.schema_matching.one2one.twophase import TwoPhaseSchemaMatcher +from bdikit.schema_matching.one2one.contrastivelearning import ContrastiveLearningSchemaMatcher def test_basic_column_mapping_algorithms():