diff --git a/bdikit/__init__.py b/bdikit/__init__.py index b99a3f57..79d0deeb 100644 --- a/bdikit/__init__.py +++ b/bdikit/__init__.py @@ -1,3 +1,3 @@ -__version__ = '0.2.0.dev0' +__version__ = "0.2.0.dev0" # To shortcut the import path -from bdikit.api import APIManager \ No newline at end of file +from bdikit.api import APIManager diff --git a/bdikit/api.py b/bdikit/api.py index c7621a24..70a453af 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -2,19 +2,24 @@ from bdikit.mapping_recommendation.scope_reducing_manager import ScopeReducingManager from bdikit.mapping_recommendation.value_mapping_manager import ValueMappingManager from bdikit.mapping_recommendation.column_mapping_manager import ColumnMappingManager -from bdikit.visualization.mappings import plot_reduce_scope, plot_column_mappings, plot_value_mappings +from bdikit.visualization.mappings import ( + plot_reduce_scope, + plot_column_mappings, + plot_value_mappings, +) from bdikit.utils import get_gdc_data from os.path import join, dirname import os -os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages +os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable huggingface messages -GDC_DATA_PATH = join(dirname(__file__), './resource/gdc_table.csv') +GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv") -class APIManager(): - - def __init__(self,): +class APIManager: + def __init__( + self, + ): # TODO: move into database object (in data_ingestion folder) self.dataset = None # TODO: move into database object (in data_ingestion folder) @@ -23,8 +28,8 @@ def __init__(self,): self.reduced_scope = None self.column_manager = None self.value_manager = None - self.column_mappings = None # TODO move this to a property in column_manager - self.value_mappings = None # TODO move this to a property in value_manager + self.column_mappings = None # TODO move this to a property in column_manager + self.value_mappings = None # TODO move this to a property in value_manager def load_global_table(self, global_table_path=None): if global_table_path is None: @@ -47,27 +52,35 @@ def reduce_scope(self, num_columns=5, num_candidates=5): return self.reduced_scope - def map_columns(self, algorithm='SimFloodAlgorithm'): - self.column_manager = ColumnMappingManager(self.dataset, self.global_table, algorithm) + def map_columns(self, algorithm="SimFloodAlgorithm"): + self.column_manager = ColumnMappingManager( + self.dataset, self.global_table, algorithm + ) self.column_manager.reduced_scope = self.reduced_scope self.column_mappings = self.column_manager.map() plot_column_mappings(self.column_mappings) return self.column_mappings - def map_values(self, algorithm='EditAlgorithm'): + def map_values(self, algorithm="EditAlgorithm"): self.global_table_all = get_gdc_data(self.column_mappings.values()) - self.value_manager = ValueMappingManager(self.dataset, self.column_mappings, self.global_table_all, algorithm) + self.value_manager = ValueMappingManager( + self.dataset, self.column_mappings, self.global_table_all, algorithm + ) self.value_mappings = self.value_manager.map() plot_value_mappings(self.value_mappings) return self.value_mappings - def update_reduced_scope(self, original_column, new_candidate_name, new_candidate_sim=1.0): + def update_reduced_scope( + self, original_column, new_candidate_name, new_candidate_sim=1.0 + ): for index in range(len(self.reduced_scope)): - if self.reduced_scope[index]['Candidate column'] == original_column: - self.reduced_scope[index]['Top k columns'].append((new_candidate_name, 
new_candidate_sim)) - print('Reduced scope updated!') + if self.reduced_scope[index]["Candidate column"] == original_column: + self.reduced_scope[index]["Top k columns"].append( + (new_candidate_name, new_candidate_sim) + ) + print("Reduced scope updated!") plot_reduce_scope(self.reduced_scope) break @@ -75,13 +88,22 @@ def update_column_mappings(self, new_mappings): for original_column, new_target_column in new_mappings: self.column_mappings[original_column] = new_target_column - print('Column mapping updated!') + print("Column mapping updated!") plot_column_mappings(self.column_mappings) - def update_value_mappings(self, original_column, original_value, new_target_value, new_similarity=1.0): - for index in range(len(self.value_mappings[original_column]['matches'])): - if self.value_mappings[original_column]['matches'][index][0] == original_value: - self.value_mappings[original_column]['matches'][index] = (original_value, new_target_value, new_similarity) - print('Value mapping updated!') + def update_value_mappings( + self, original_column, original_value, new_target_value, new_similarity=1.0 + ): + for index in range(len(self.value_mappings[original_column]["matches"])): + if ( + self.value_mappings[original_column]["matches"][index][0] + == original_value + ): + self.value_mappings[original_column]["matches"][index] = ( + original_value, + new_target_value, + new_similarity, + ) + print("Value mapping updated!") plot_value_mappings(self.value_mappings) break diff --git a/bdikit/data_ingestion/column.py b/bdikit/data_ingestion/column.py index d79b3f73..9787678a 100644 --- a/bdikit/data_ingestion/column.py +++ b/bdikit/data_ingestion/column.py @@ -1,41 +1,43 @@ from enum import Enum + class ColumnType(Enum): - STRING = 'string' - FLOAT = 'float' - INTEGER = 'integer' - # TODO semantic types? + STRING = "string" + FLOAT = "float" + INTEGER = "integer" + # TODO semantic types? + class Column: - def __init__(self, df_name, column_name, column_type=ColumnType.STRING, domain_values=None, null_values_representations=None): + def __init__( + self, + df_name, + column_name, + column_type=ColumnType.STRING, + domain_values=None, + null_values_representations=None, + ): self.df_name = df_name self.column_name = column_name self.column_type = column_type - + if domain_values is None: self.domain_values = set() else: self.domain_values = set(domain_values) - + if null_values_representations is None: self.null_values_representations = set() else: self.null_values_representations = set(null_values_representations) - - def __str__(self): return f"Column(df_name={self.df_name}, column_name={self.column_name}, column_type={self.column_type}, domain_values={self.domain_values}, null_values_representations={self.null_values_representations})" - + def __eq__(self, value): if not isinstance(value, Column): return False return self.df_name == value.df_name and self.column_name == value.column_name - + def __hash__(self): return hash((self.df_name, self.column_name)) - - - - - diff --git a/bdikit/data_ingestion/database.py b/bdikit/data_ingestion/database.py index 701b9f41..1f8968d8 100644 --- a/bdikit/data_ingestion/database.py +++ b/bdikit/data_ingestion/database.py @@ -3,6 +3,7 @@ from .column import Column, ColumnType + class Database: """ A class representing a database that stores dataframes. @@ -14,7 +15,7 @@ class Database: load_data(df_name, file_path): Load data from a CSV file into a dataframe and store it in the database. load_data_from_folder(folder_path): Load data from all CSV files in a folder. 
get_dataframe(df_name): Retrieve a dataframe by its name. - get_dataframe_names(): Get the names of all dataframes stored in the database. + get_dataframe_names(): Get the names of all dataframes stored in the database. describe_database(): Print out the names, shape, columns, and head of all dataframes stored in the database. """ @@ -32,7 +33,8 @@ def load_data(self, df_name, file_path): """ if df_name in self.dataframes: raise ValueError( - f"Dataframe associated with file name '{df_name}' already exists in the database.") + f"Dataframe associated with file name '{df_name}' already exists in the database." + ) df = pd.read_csv(file_path) self.dataframes[df_name] = df @@ -42,7 +44,6 @@ def load_data(self, df_name, file_path): column = Column(df_name, c, ColumnType.STRING) self.columns.add(column) - def load_data_from_folder(self, folder_path): """ Function to load data from all CSV files in a folder using the Database class. @@ -76,7 +77,7 @@ def get_dataframe_names(self): list: A list of dataframe names. """ return list(self.dataframes.keys()) - + def get_columns(self): """ Get the names of all columns stored in the database. @@ -99,7 +100,6 @@ def describe_database(self): # print(f"\t\t- Head: \n{self.dataframes[df_name].head()}") - # def main(): # col1 = Column('df1', 'col1', ColumnType.STRING, ['a', 'b', 'c'], ['n/a', 'na']) # col2 = Column('df1', 'col2', ColumnType.INTEGER, [1, 2, 3], ['n/a', 'na']) @@ -112,4 +112,4 @@ def describe_database(self): # print(col3 == col4) # if __name__ == "__main__": -# main() \ No newline at end of file +# main() diff --git a/bdikit/data_ingestion/dataset_loader.py b/bdikit/data_ingestion/dataset_loader.py index 46a9e28f..a7646777 100644 --- a/bdikit/data_ingestion/dataset_loader.py +++ b/bdikit/data_ingestion/dataset_loader.py @@ -4,4 +4,4 @@ def load_dataframe(dataset_path): dataset = pd.read_csv(dataset_path) - return dataset \ No newline at end of file + return dataset diff --git a/bdikit/download.py b/bdikit/download.py index b1b8b912..10765082 100644 --- a/bdikit/download.py +++ b/bdikit/download.py @@ -60,7 +60,7 @@ def get_cached_model_or_download(model_name: str): if len(sys.argv) < 2: print("Please provide a model_id as a command line argument.") sys.exit(1) - + model_id = sys.argv[1] model_path = get_cached_model_or_download(model_id) print(f"Downloaded model: {model_path}") diff --git a/bdikit/mapping_algorithms/column_mapping/algorithms.py b/bdikit/mapping_algorithms/column_mapping/algorithms.py index 988cfec7..b9b069cc 100644 --- a/bdikit/mapping_algorithms/column_mapping/algorithms.py +++ b/bdikit/mapping_algorithms/column_mapping/algorithms.py @@ -1,10 +1,15 @@ from valentine import valentine_match -from valentine.algorithms import SimilarityFlooding,Coma,Cupid,DistributionBased,JaccardDistanceMatcher +from valentine.algorithms import ( + SimilarityFlooding, + Coma, + Cupid, + DistributionBased, + JaccardDistanceMatcher, +) from openai import OpenAI -class BaseColumnMappingAlgorithm(): - +class BaseColumnMappingAlgorithm: def __init__(self, dataset, global_table): self._dataset = dataset self._global_table = global_table @@ -14,10 +19,9 @@ def map(self): class SimFloodAlgorithm(BaseColumnMappingAlgorithm): - def __init__(self, dataset, global_table): super().__init__(dataset, global_table) - + def map(self): matcher = SimilarityFlooding() matches = valentine_match(self._dataset, self._global_table, matcher) @@ -26,14 +30,14 @@ def map(self): for match in matches.one_to_one(): dataset_candidate = match[0][1] global_table_candidate = 
match[1][1] - mappings[dataset_candidate] = global_table_candidate + mappings[dataset_candidate] = global_table_candidate return mappings - -class ComaAlgorithm(BaseColumnMappingAlgorithm): + +class ComaAlgorithm(BaseColumnMappingAlgorithm): def __init__(self, dataset, global_table): super().__init__(dataset, global_table) - + def map(self): matcher = Coma() matches = valentine_match(self._dataset, self._global_table, matcher) @@ -42,67 +46,66 @@ def map(self): for match in matches.one_to_one(): dataset_candidate = match[0][1] global_table_candidate = match[1][1] - mappings[dataset_candidate] = global_table_candidate + mappings[dataset_candidate] = global_table_candidate return mappings - + + class CupidAlgorithm(BaseColumnMappingAlgorithm): - - def __init__(self, dataset, global_table): - super().__init__(dataset, global_table) - - def map(self): - matcher = Cupid() - matches = valentine_match(self._dataset, self._global_table, matcher) - - mappings = {} - for match in matches.one_to_one(): - dataset_candidate = match[0][1] - global_table_candidate = match[1][1] - mappings[dataset_candidate] = global_table_candidate - return mappings - + def __init__(self, dataset, global_table): + super().__init__(dataset, global_table) + + def map(self): + matcher = Cupid() + matches = valentine_match(self._dataset, self._global_table, matcher) + + mappings = {} + for match in matches.one_to_one(): + dataset_candidate = match[0][1] + global_table_candidate = match[1][1] + mappings[dataset_candidate] = global_table_candidate + return mappings + + class DistributionBasedAlgorithm(BaseColumnMappingAlgorithm): - - def __init__(self, dataset, global_table): - super().__init__(dataset, global_table) - - def map(self): - matcher = DistributionBased() - matches = valentine_match(self._dataset, self._global_table, matcher) - - mappings = {} - for match in matches.one_to_one(): - dataset_candidate = match[0][1] - global_table_candidate = match[1][1] - mappings[dataset_candidate] = global_table_candidate - return mappings - + def __init__(self, dataset, global_table): + super().__init__(dataset, global_table) + + def map(self): + matcher = DistributionBased() + matches = valentine_match(self._dataset, self._global_table, matcher) + + mappings = {} + for match in matches.one_to_one(): + dataset_candidate = match[0][1] + global_table_candidate = match[1][1] + mappings[dataset_candidate] = global_table_candidate + return mappings + + class JaccardDistanceAlgorithm(BaseColumnMappingAlgorithm): - - def __init__(self, dataset, global_table): - super().__init__(dataset, global_table) - - def map(self): - matcher = JaccardDistanceMatcher() - matches = valentine_match(self._dataset, self._global_table, matcher) - - mappings = {} - for match in matches.one_to_one(): - dataset_candidate = match[0][1] - global_table_candidate = match[1][1] - mappings[dataset_candidate] = global_table_candidate - return mappings - + def __init__(self, dataset, global_table): + super().__init__(dataset, global_table) + + def map(self): + matcher = JaccardDistanceMatcher() + matches = valentine_match(self._dataset, self._global_table, matcher) + + mappings = {} + for match in matches.one_to_one(): + dataset_candidate = match[0][1] + global_table_candidate = match[1][1] + mappings[dataset_candidate] = global_table_candidate + return mappings + class GPTAlgorithm(BaseColumnMappingAlgorithm): - def __init__(self, dataset, global_table): super().__init__(dataset, global_table) self.client = OpenAI() - + def map(self): global_columns = 
self._global_table.columns - labels = ', '.join(global_columns) + labels = ", ".join(global_columns) candidate_columns = self._dataset.columns mappings = {} for column in candidate_columns: @@ -121,20 +124,23 @@ def map(self): break return mappings - - def get_column_type(self, context, labels, m=10, model="gpt-4-turbo-preview"): - messages=[ - { - "role": "system", - "content": "You are an assistant for column matching."}, - { - "role": "user", - "content": """ Please select the top """ + str(m) + """ class from """ + labels + """ which best describes the context. The context is defined by the column name followed by its respective values. Please respond only with the name of the classes separated by semicolon. - \n CONTEXT: """ + context + """ \n RESPONSE: \n"""}, - ] - col_type = self.client.chat.completions.create(model=model, - messages=messages, - temperature=0.3) + messages = [ + {"role": "system", "content": "You are an assistant for column matching."}, + { + "role": "user", + "content": """ Please select the top """ + + str(m) + + """ class from """ + + labels + + """ which best describes the context. The context is defined by the column name followed by its respective values. Please respond only with the name of the classes separated by semicolon. + \n CONTEXT: """ + + context + + """ \n RESPONSE: \n""", + }, + ] + col_type = self.client.chat.completions.create( + model=model, messages=messages, temperature=0.3 + ) col_type_content = col_type.choices[0].message.content - return col_type_content.split(";") \ No newline at end of file + return col_type_content.split(";") diff --git a/bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_api.py b/bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_api.py index aabb7ec5..4d7d10b6 100644 --- a/bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_api.py +++ b/bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_api.py @@ -4,10 +4,12 @@ import numpy as np import pandas as pd import torch -from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_models import \ - BarlowTwinsSimCLR -from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_pretrained_dataset import \ - PretrainTableDataset +from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_models import ( + BarlowTwinsSimCLR, +) +from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_pretrained_dataset import ( + PretrainTableDataset, +) from sklearn.metrics.pairwise import cosine_similarity from tqdm import tqdm diff --git a/bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_pretrained_dataset.py b/bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_pretrained_dataset.py index 1bbbc47d..f48e9f6e 100644 --- a/bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_pretrained_dataset.py +++ b/bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_pretrained_dataset.py @@ -1,7 +1,8 @@ import pandas as pd import torch -from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_preprocessor import \ - preprocess +from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_preprocessor import ( + preprocess, +) from torch.utils import data from transformers import AutoTokenizer diff --git a/bdikit/mapping_algorithms/scope_reducing/algorithms.py 
b/bdikit/mapping_algorithms/scope_reducing/algorithms.py index f371a4e9..dd3fcd7f 100644 --- a/bdikit/mapping_algorithms/scope_reducing/algorithms.py +++ b/bdikit/mapping_algorithms/scope_reducing/algorithms.py @@ -1,6 +1,7 @@ import pandas as pd -from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import \ - ContrastiveLearningAPI +from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import ( + ContrastiveLearningAPI, +) from bdikit.download import get_cached_model_or_download @@ -13,10 +14,9 @@ def reduce_scope(self, dataset: pd.DataFrame): class YurongReducer(BaseReducer): - def __init__(self): super().__init__() - model_path = get_cached_model_or_download('cl-reducer-v0.1') + model_path = get_cached_model_or_download("cl-reducer-v0.1") self.api = ContrastiveLearningAPI(model_path=model_path, top_k=20) def reduce_scope(self, dataset: pd.DataFrame): diff --git a/bdikit/mapping_algorithms/value_mapping/algorithms.py b/bdikit/mapping_algorithms/value_mapping/algorithms.py index a3118888..1872a117 100644 --- a/bdikit/mapping_algorithms/value_mapping/algorithms.py +++ b/bdikit/mapping_algorithms/value_mapping/algorithms.py @@ -5,21 +5,20 @@ from flair.embeddings import TransformerWordEmbeddings -class BaseAlgorithm(): - +class BaseAlgorithm: def __init__(self, *args): pass - + def match(self, current_values, target_values, threshold=0.8): self.model.match(current_values, target_values) match_results = self.model.get_matches() - match_results.sort_values(by='Similarity', ascending=False, inplace=True) + match_results.sort_values(by="Similarity", ascending=False, inplace=True) matches = [] for _, row in match_results.iterrows(): - current_value = row['From'] - target_value = row['To'] - similarity = row['Similarity'] + current_value = row["From"] + target_value = row["To"] + similarity = row["Similarity"] if similarity >= threshold: matches.append((current_value, target_value, similarity)) @@ -27,11 +26,10 @@ def match(self, current_values, target_values, threshold=0.8): class TFIDFAlgorithm(BaseAlgorithm): - def __init__(self): method = TFIDF(min_similarity=0) self.model = PolyFuzz(method) - + def match(self, current_values, target_values, threshold=0.8): matches = super().match(current_values, target_values, threshold) @@ -39,11 +37,10 @@ def match(self, current_values, target_values, threshold=0.8): class EditAlgorithm(BaseAlgorithm): - def __init__(self): method = EditDistance(n_jobs=-1) self.model = PolyFuzz(method) - + def match(self, current_values, target_values, threshold=0.8): matches = super().match(current_values, target_values, threshold) @@ -51,12 +48,11 @@ def match(self, current_values, target_values, threshold=0.8): class EmbeddingAlgorithm(BaseAlgorithm): - - def __init__(self, model_path='bert-base-multilingual-cased'): + def __init__(self, model_path="bert-base-multilingual-cased"): embeddings = TransformerWordEmbeddings(model_path) - method = Embeddings(embeddings, min_similarity=0, model_id='embedding_model') + method = Embeddings(embeddings, min_similarity=0, model_id="embedding_model") self.model = PolyFuzz(method) - + def match(self, current_values, target_values, threshold=0.8): matches = super().match(current_values, target_values, threshold) @@ -64,34 +60,42 @@ def match(self, current_values, target_values, threshold=0.8): class LLMAlgorithm(BaseAlgorithm): - def __init__(self): self.client = OpenAI() - + def match(self, current_values, target_values, threshold=0.8): target_values_set = 
set(target_values) matches = [] for current_value in current_values: completion = self.client.chat.completions.create( - model='gpt-4-turbo-preview', - messages=[ - {'role': 'system', 'content': 'You are an intelligent system that given a term, you have to choose a value from a list that best matches the term. ' - 'These terms belong to the medical domain, and the list contains terms in the Genomics Data Commons (GDC) format.'}, - {'role': 'user', 'content': f'For the term: "{current_value}", choose a value from this list {target_values}. ' - 'Return the value from the list with a similarity score, between 0 and 1, with 1 indicating the highest similarity. ' - 'DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. ' - 'Only provide a Python dictionary. For example {"term": "term from the list", "score": 0.8}.'} - ] + model="gpt-4-turbo-preview", + messages=[ + { + "role": "system", + "content": "You are an intelligent system that given a term, you have to choose a value from a list that best matches the term. " + "These terms belong to the medical domain, and the list contains terms in the Genomics Data Commons (GDC) format.", + }, + { + "role": "user", + "content": f'For the term: "{current_value}", choose a value from this list {target_values}. ' + "Return the value from the list with a similarity score, between 0 and 1, with 1 indicating the highest similarity. " + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. " + 'Only provide a Python dictionary. For example {"term": "term from the list", "score": 0.8}.', + }, + ], ) - + response_message = completion.choices[0].message.content try: response_dict = ast.literal_eval(response_message) - if response_dict['term'] in target_values_set: - matches.append((current_value, response_dict['term'], response_dict['score'])) + if response_dict["term"] in target_values_set: + matches.append( + (current_value, response_dict["term"], response_dict["score"]) + ) except: - print(f'Errors parsing response for "{current_value}": {response_message}') + print( + f'Errors parsing response for "{current_value}": {response_message}' + ) return matches - diff --git a/bdikit/mapping_recommendation/column_mapping_manager.py b/bdikit/mapping_recommendation/column_mapping_manager.py index abf1e62f..830c395a 100644 --- a/bdikit/mapping_recommendation/column_mapping_manager.py +++ b/bdikit/mapping_recommendation/column_mapping_manager.py @@ -1,6 +1,14 @@ -from bdikit.mapping_algorithms.column_mapping.algorithms import SimFloodAlgorithm, ComaAlgorithm, CupidAlgorithm, DistributionBasedAlgorithm, JaccardDistanceAlgorithm, GPTAlgorithm +from bdikit.mapping_algorithms.column_mapping.algorithms import ( + SimFloodAlgorithm, + ComaAlgorithm, + CupidAlgorithm, + DistributionBasedAlgorithm, + JaccardDistanceAlgorithm, + GPTAlgorithm, +) from enum import Enum + class MappingAlgorithm(Enum): SIMFLOOD = "SimFloodAlgorithm" COMA = "ComaAlgorithm" @@ -9,14 +17,21 @@ class MappingAlgorithm(Enum): JACCARD_DISTANCE = "JaccardDistanceAlgorithm" GPT = "GPTAlgorithm" - -class ColumnMappingManager(): - def __init__(self, dataset, global_table, algorithm=MappingAlgorithm.SIMFLOOD.value): - self._dataset = dataset #TODO: move into database object (in data_ingestion folder) - self._global_table = global_table #TODO: move into database object (in data_ingestion folder) - self._reduced_scope = None #TODO: move into database object (in data_ingestion folder) - self.mapping_algorithm = algorithm +class ColumnMappingManager: + def __init__( + self, dataset, global_table, 
algorithm=MappingAlgorithm.SIMFLOOD.value + ): + self._dataset = ( + dataset # TODO: move into database object (in data_ingestion folder) + ) + self._global_table = ( + global_table # TODO: move into database object (in data_ingestion folder) + ) + self._reduced_scope = ( + None # TODO: move into database object (in data_ingestion folder) + ) + self.mapping_algorithm = algorithm @property def reduced_scope(self): @@ -28,22 +43,23 @@ def reduced_scope(self, value): @property def dataset(self): - return self._dataset - + return self._dataset + @property def global_table(self): return self._global_table - + def map(self): if self.reduced_scope is None: - mapping_algorithm_instance = eval(self.mapping_algorithm)(self.dataset, self.global_table) - mappings = mapping_algorithm_instance.map() + mapping_algorithm_instance = eval(self.mapping_algorithm)( + self.dataset, self.global_table + ) + mappings = mapping_algorithm_instance.map() return mappings else: # For each reduction suggestion, we build a new dataset and global table and run the mapping algorithm mappings = {} for reduction in self.reduced_scope: - dataset_column = reduction["Candidate column"] global_table_columns = [x[0] for x in reduction["Top k columns"]] @@ -52,17 +68,19 @@ def map(self): else: continue - common_cols = set(global_table_columns).intersection(self.global_table.columns) + common_cols = set(global_table_columns).intersection( + self.global_table.columns + ) reduced_global_table = self.global_table[list(common_cols)] - - mapping_algorithm_instance = eval(self.mapping_algorithm)(reduced_dataset, reduced_global_table) - partial_mappings = mapping_algorithm_instance.map() - - if len(partial_mappings.keys())>0: + + mapping_algorithm_instance = eval(self.mapping_algorithm)( + reduced_dataset, reduced_global_table + ) + partial_mappings = mapping_algorithm_instance.map() + + if len(partial_mappings.keys()) > 0: candidate_col = next(iter(partial_mappings)) target_col = partial_mappings[candidate_col] mappings[candidate_col] = target_col - + return mappings - - diff --git a/bdikit/mapping_recommendation/value_mapping_manager.py b/bdikit/mapping_recommendation/value_mapping_manager.py index b1081399..66b0fcf7 100644 --- a/bdikit/mapping_recommendation/value_mapping_manager.py +++ b/bdikit/mapping_recommendation/value_mapping_manager.py @@ -1,7 +1,12 @@ import pandas as pd from enum import Enum import matplotlib.pyplot as plt -from bdikit.mapping_algorithms.value_mapping.algorithms import TFIDFAlgorithm, LLMAlgorithm, EditAlgorithm, EmbeddingAlgorithm +from bdikit.mapping_algorithms.value_mapping.algorithms import ( + TFIDFAlgorithm, + LLMAlgorithm, + EditAlgorithm, + EmbeddingAlgorithm, +) class MappingAlgorithm(Enum): @@ -11,28 +16,35 @@ class MappingAlgorithm(Enum): LLM = "LLMAlgorithm" -class ValueMappingManager(): - - def __init__(self, dataset, column_mapping, target_domain, algorithm=MappingAlgorithm.EDIT.value): +class ValueMappingManager: + def __init__( + self, + dataset, + column_mapping, + target_domain, + algorithm=MappingAlgorithm.EDIT.value, + ): self.matcher_method = eval(algorithm)() self.dataset = dataset self.column_mapping = column_mapping self.target_domain = target_domain self.mapping_results = None - + def calculate_coverage(self): if self.mapping_results is None: self._match_values() - sorted_results = sorted(self.mapping_results.items(), key=lambda x: x[1]['coverage'], reverse=True) + sorted_results = sorted( + self.mapping_results.items(), key=lambda x: x[1]["coverage"], reverse=True + ) total = 0 for 
column_name, match_data in sorted_results: - coverage = match_data['coverage'] * 100 + coverage = match_data["coverage"] * 100 total += coverage - print(f'Column {column_name}: {coverage:.2f}%') - + print(f"Column {column_name}: {coverage:.2f}%") + total = total / len(sorted_results) - print(f'Total: {total:.2f}%') + print(f"Total: {total:.2f}%") def skip_values(self, values): max_length = 50 @@ -43,52 +55,68 @@ def skip_values(self, values): else: return False - def map(self): if self.mapping_results is None: self._match_values() return self.mapping_results - def _match_values(self): self.mapping_results = {} for current_column, target_column in self.column_mapping.items(): if self.target_domain[target_column] is None: continue - target_values_dict = {x.lower(): x for x in self.target_domain[target_column]} + target_values_dict = { + x.lower(): x for x in self.target_domain[target_column] + } unique_values = self.dataset[current_column].unique() if self.skip_values(unique_values): continue - current_values_dict = {str(x).strip().lower(): str(x).strip() for x in unique_values} - self.mapping_results[current_column] = {'matches': None, 'coverage': None, - 'unique_values': None, 'unmatch_values': None} - - matches_lowercase = self.matcher_method.match(list(current_values_dict.keys()), list(target_values_dict.keys())) + current_values_dict = { + str(x).strip().lower(): str(x).strip() for x in unique_values + } + self.mapping_results[current_column] = { + "matches": None, + "coverage": None, + "unique_values": None, + "unmatch_values": None, + } + + matches_lowercase = self.matcher_method.match( + list(current_values_dict.keys()), list(target_values_dict.keys()) + ) matches = [] for current_value, target_value, similarity in matches_lowercase: - matches.append((current_values_dict[current_value], target_values_dict[target_value], similarity)) + matches.append( + ( + current_values_dict[current_value], + target_values_dict[target_value], + similarity, + ) + ) coverage = len(matches) / len(current_values_dict) current_values = set(current_values_dict.values()) match_values = set([x[0] for x in matches]) - self.mapping_results[current_column]['matches'] = matches - self.mapping_results[current_column]['coverage'] = coverage - self.mapping_results[current_column]['unique_values'] = current_values - self.mapping_results[current_column]['unmatch_values'] = current_values - match_values + self.mapping_results[current_column]["matches"] = matches + self.mapping_results[current_column]["coverage"] = coverage + self.mapping_results[current_column]["unique_values"] = current_values + self.mapping_results[current_column]["unmatch_values"] = ( + current_values - match_values + ) def plot_unique_values(self): unique_counts = self.dataset.nunique() - + # Plot the number of unique values for each column plt.figure(figsize=(10, 6)) - unique_counts.plot(kind='bar', color='skyblue') - plt.xlabel('Columns') - plt.ylabel('Number of Unique Values') + unique_counts.plot(kind="bar", color="skyblue") + plt.xlabel("Columns") + plt.ylabel("Number of Unique Values") plt.xticks(rotation=90) - plt.grid(axis='y', linestyle='--', alpha=0.7) + plt.grid(axis="y", linestyle="--", alpha=0.7) plt.tight_layout() plt.show() diff --git a/bdikit/transformation/transformer.py b/bdikit/transformation/transformer.py index 9e128815..e8508fba 100644 --- a/bdikit/transformation/transformer.py +++ b/bdikit/transformation/transformer.py @@ -8,9 +8,8 @@ def __init__(self, schema_path): if not self.is_schema_valid(schema): raise 
ValueError("Invalid schema") self.schema = schema - - def transform(self, df: pd.DataFrame): + def transform(self, df: pd.DataFrame): for column in self.schema: if column not in df.columns: continue @@ -24,9 +23,9 @@ def transform(self, df: pd.DataFrame): df[column] = df[column].str.lower().map(values).fillna(default) else: df[column] = df[column].map(values).fillna(default) - + return df @staticmethod def is_schema_valid(schema): - return True \ No newline at end of file + return True diff --git a/bdikit/utils.py b/bdikit/utils.py index 9a98316e..f55c6e1e 100644 --- a/bdikit/utils.py +++ b/bdikit/utils.py @@ -1,7 +1,7 @@ import json from os.path import join, dirname -GDC_SCHEMA_PATH = join(dirname(__file__), './resource/gdc_schema.json') +GDC_SCHEMA_PATH = join(dirname(__file__), "./resource/gdc_schema.json") def read_gdc_schema(): @@ -24,21 +24,22 @@ def get_gdc_data(column_names): def get_gdc_values(column_name, gdc_schema): for key, values in gdc_schema.items(): - for key in values['properties'].keys(): + for key in values["properties"].keys(): if column_name == key: - value_metadata = values['properties'][column_name] + value_metadata = values["properties"][column_name] if "enum" in value_metadata: - return value_metadata['enum'] - elif "type" in value_metadata and value_metadata['type'] == 'number': + return value_metadata["enum"] + elif "type" in value_metadata and value_metadata["type"] == "number": return None return None + def get_all_gdc_columns(): all_columns = [] gdc_schema = read_gdc_schema() for key, values in gdc_schema.items(): - for key in values['properties'].keys(): + for key in values["properties"].keys(): all_columns.append(key) return all_columns @@ -48,7 +49,7 @@ def get_gdc_metadata(): gdc_schema = read_gdc_schema() for key, values in gdc_schema.items(): - for key, data in values['properties'].items(): + for key, data in values["properties"].items(): metadata[key] = data return metadata diff --git a/bdikit/visualization/mappings.py b/bdikit/visualization/mappings.py index 1ae700ad..b85bba04 100644 --- a/bdikit/visualization/mappings.py +++ b/bdikit/visualization/mappings.py @@ -3,7 +3,8 @@ from IPython.display import display from bdikit.utils import get_gdc_metadata -pd.set_option('display.max_colwidth', None) +pd.set_option("display.max_colwidth", None) + def plot_reduce_scope(reduced_scope, num_columns=5, num_candidates=5, max_chars=150): gdc_metadata = get_gdc_metadata() @@ -12,45 +13,63 @@ def plot_reduce_scope(reduced_scope, num_columns=5, num_candidates=5, max_chars= num_columns = len(reduced_scope) if num_candidates is None: - num_candidates = len(reduced_scope[0]['Top k columns']) + num_candidates = len(reduced_scope[0]["Top k columns"]) for column_data in reduced_scope[:num_columns]: - column_name = column_data['Candidate column'] + column_name = column_data["Candidate column"] recommendations = [] - for candidate_name, candidate_similarity in column_data['Top k columns'][:num_candidates]: - candidate_description = gdc_metadata[candidate_name].get('description', '') + for candidate_name, candidate_similarity in column_data["Top k columns"][ + :num_candidates + ]: + candidate_description = gdc_metadata[candidate_name].get("description", "") candidate_description = truncate_text(candidate_description, max_chars) - candidate_values = ', '.join(gdc_metadata[candidate_name].get('enum', [])) + candidate_values = ", ".join(gdc_metadata[candidate_name].get("enum", [])) candidate_values = truncate_text(candidate_values, max_chars) - 
recommendations.append((candidate_name, candidate_similarity, candidate_description, candidate_values)) + recommendations.append( + ( + candidate_name, + candidate_similarity, + candidate_description, + candidate_values, + ) + ) - print(f'\n{column_name}:') - candidates_df = pd.DataFrame(recommendations, columns=['Candidate', 'Similarity', 'Description', 'Values (sample)']) + print(f"\n{column_name}:") + candidates_df = pd.DataFrame( + recommendations, + columns=["Candidate", "Similarity", "Description", "Values (sample)"], + ) display(candidates_df) def plot_column_mappings(column_mappings): - column_mappings_df = pd.DataFrame(column_mappings.items(), columns=['Original Column', 'Target Column']) + column_mappings_df = pd.DataFrame( + column_mappings.items(), columns=["Original Column", "Target Column"] + ) display(column_mappings_df) def plot_value_mappings(value_mappings, include_unmatches=True): - sorted_results = sorted(value_mappings.items(), key=lambda x: x[1]['coverage'], reverse=True) + sorted_results = sorted( + value_mappings.items(), key=lambda x: x[1]["coverage"], reverse=True + ) for column_name, _ in sorted_results: - matches = deepcopy(value_mappings[column_name]['matches']) - print(f'\nColumn {column_name}:') + matches = deepcopy(value_mappings[column_name]["matches"]) + print(f"\nColumn {column_name}:") if include_unmatches: - for unmatch_value in value_mappings[column_name]['unmatch_values']: - matches.append((unmatch_value, '-', '-')) - - matches_df = pd.DataFrame(matches, columns=['Current Value', 'Target Value', 'Similarity']) + for unmatch_value in value_mappings[column_name]["unmatch_values"]: + matches.append((unmatch_value, "-", "-")) + + matches_df = pd.DataFrame( + matches, columns=["Current Value", "Target Value", "Similarity"] + ) display(matches_df) def truncate_text(text, max_chars): if len(text) > max_chars: - return text[:max_chars] + '...' + return text[:max_chars] + "..." 
     else:
         return text
diff --git a/examples/column_and_value_mapping.py b/examples/column_and_value_mapping.py
index b844e6e3..aab4bd6f 100644
--- a/examples/column_and_value_mapping.py
+++ b/examples/column_and_value_mapping.py
@@ -4,13 +4,13 @@
 
 manager = APIManager()
 
-dataset_path = './datasets/dou.csv'
+dataset_path = "./datasets/dou.csv"
 dataset = manager.load_dataset(dataset_path)
-print('Dataset:')
+print("Dataset:")
 print(dataset)
-print('Reduced scope:')
+print("Reduced scope:")
 reduced_scope = manager.reduce_scope()
-print('Column mappings:')
+print("Column mappings:")
 column_mappings = manager.map_columns()
-print('Value mappings:')
+print("Value mappings:")
 value_mappings = manager.map_values()
diff --git a/setup.py b/setup.py
index a33bef63..bd81d264 100644
--- a/setup.py
+++ b/setup.py
@@ -2,29 +2,35 @@
 
 import setuptools
 
-package_name = 'bdi-kit'
-package_dir = 'bdikit'
+package_name = "bdi-kit"
+package_dir = "bdikit"
 
 
 def read_readme():
-    with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf8') as file:
+    with open(
+        os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf8"
+    ) as file:
         return file.read()
 
 
 def read_version():
-    module_path = os.path.join(package_dir, '__init__.py')
+    module_path = os.path.join(package_dir, "__init__.py")
     with open(module_path) as file:
         for line in file:
-            parts = line.strip().split(' ')
-            if parts and parts[0] == '__version__':
-                return parts[-1].strip("'")
-    raise KeyError('Version not found in {0}'.format(module_path))
+            parts = line.strip().split(" ")
+            if parts and parts[0] == "__version__":
+                # __init__.py now uses double quotes, so strip both quote styles.
+                return parts[-1].strip("'\"")
+    raise KeyError("Version not found in {0}".format(module_path))
 
 
 def get_requires():
-    with open('requirements.txt') as fp:
-        dependencies = [line for line in fp if line and not line.startswith('#')]
+    with open("requirements.txt") as fp:
+        # Drop trailing newlines and skip blank lines, not just comments.
+        dependencies = [
+            line.strip() for line in fp if line.strip() and not line.startswith("#")
+        ]
 
     return dependencies
 
@@ -42,18 +48,19 @@ def get_requires():
     extras_require=extra_requires,
     description="bdi-kit library",
     long_description=long_description,
-    long_description_content_type='text/markdown',
-    url='https://github.com/VIDA-NYU/bdi-kit',
+    long_description_content_type="text/markdown",
+    url="https://github.com/VIDA-NYU/bdi-kit",
     include_package_data=True,
-    author='',
-    author_email='',
-    maintainer='',
-    maintainer_email='',
-    keywords=['askem', 'data integration', 'nyu'],
-    license='Apache-2.0',
+    author="",
+    author_email="",
+    maintainer="",
+    maintainer_email="",
+    keywords=["askem", "data integration", "nyu"],
+    license="Apache-2.0",
     classifiers=[
-        'Development Status :: 5 - Production/Stable',
-        'Intended Audience :: Science/Research',
-        'License :: OSI Approved :: Apache Software License',
-        'Topic :: Scientific/Engineering',
-    ])
\ No newline at end of file
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Topic :: Scientific/Engineering",
+    ],
+)
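
For reviewers who want to exercise the reformatted API end to end, the flow below mirrors examples/column_and_value_mapping.py plus the manual-correction hooks in bdikit/api.py. This is a minimal sketch, not part of the diff: the CSV path and the corrected column/value names are hypothetical placeholders, and the algorithm arguments shown are the defaults from api.py.

```python
# Minimal usage sketch of the APIManager pipeline touched by this diff.
# The dataset path and the corrected names below are hypothetical.
from bdikit.api import APIManager

manager = APIManager()
dataset = manager.load_dataset("./datasets/example.csv")  # hypothetical CSV

reduced_scope = manager.reduce_scope()  # plots top-k candidate target columns
column_mappings = manager.map_columns(algorithm="SimFloodAlgorithm")
value_mappings = manager.map_values(algorithm="EditAlgorithm")

# Manual corrections; each update re-plots the affected mappings.
manager.update_column_mappings([("tumor_stage", "ajcc_pathologic_stage")])
manager.update_value_mappings("tumor_stage", "iiic", "Stage IIIC", 1.0)
```

The other matchers wired up in this diff can be selected by name: map_columns also accepts "ComaAlgorithm", "CupidAlgorithm", "DistributionBasedAlgorithm", "JaccardDistanceAlgorithm", and "GPTAlgorithm"; map_values also accepts "TFIDFAlgorithm", "EmbeddingAlgorithm", and "LLMAlgorithm". The GPT/LLM options instantiate OpenAI() and therefore require OpenAI credentials in the environment.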