# Python import multiprocessing from experiments import run_experiment_filter_schemes, run_reduction_experiment, run_scalability_experiment, run_matching_without_silkmoth_inc_dep import os from data_loader import DataLoader from utils import load_sets_from_files from src.silkmoth.utils import jaccard_similarity, contain, similar, SigType, edit_similarity def run_experiment_multi(experiment_method, *args): experiment_method(*args) if __name__ == "__main__": data_loader = DataLoader("/") # Labels for Filter Experiments labels_filter = ["NO FILTER", "CHECK FILTER", "NN FILTER"] # Labels for Signature Scheme labels_sig_schemes = [SigType.WEIGHTED, SigType.SKYLINE, SigType.DICHOTOMY] # Labels for Reduction labels_reduction = ["REDUCTION", "NO REDUCTION"] # Load the datasets for Experiments data_path = os.path.join(os.path.dirname(__file__), "data", "dblp", "DBLP_100k.csv") source_string_matching = data_loader.load_dblp_titles(data_path) source_string_matching = [title.split() for title in source_string_matching] try: folder_path = os.path.join(os.path.dirname(__file__), "../experiments/data/webtables") folder_path = os.path.normpath(folder_path) reference_sets_in_dep, source_sets_in_dep = load_sets_from_files( folder_path=folder_path, reference_file="reference_sets_inclusion_dependency.json", source_file="source_sets_inclusion_dependency.json" ) reference_sets_schema_matching, source_sets_schema_matching = load_sets_from_files( folder_path=folder_path, reference_file="webtable_schemas_sets_500k.json", source_file="webtable_schemas_sets_500k.json" ) del reference_sets_schema_matching _, github_source_sets_schema_matching = load_sets_from_files( folder_path=folder_path, reference_file="github_webtable_schemas_sets_500k.json", source_file="github_webtable_schemas_sets_500k.json" ) except FileNotFoundError: print("Datasets not found. Skipping Experiments.") reference_sets_in_dep, source_sets_in_dep, reference_sets_in_dep_reduction = [], [], [] source_sets_schema_matching = [] github_source_sets_schema_matching = [] # Experiment configuration experiment_config = { "filter_runs": False, "signature_scheme_runs": False, "reduction_runs": False, "scalability_runs": False, "schema_github_webtable_runs": False, "inc_dep_without_silkmoth": True } # Define experiments to run experiments = [] if experiment_config["filter_runs"]: # Filter runs # String Matching Experiment experiments.append(( run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.7, 0.75, 0.8, 0.85], labels_filter, source_string_matching[:10_000], None, similar, edit_similarity , False, "string_matching_filter", "results/string_matching/" )) # Schema Matching Experiment experiments.append(( run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75], labels_filter, source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False, "schema_matching_filter", "results/schema_matching/" )) # Inclusion Dependency Experiment experiments.append(( run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75], labels_filter, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True, "inclusion_dependency_filter", "results/inclusion_dependency/" )) if experiment_config["signature_scheme_runs"]: # Signature Scheme Runs #String Matching Experiment experiments.append(( run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.7, 0.75, 0.8, 0.85], labels_sig_schemes, source_string_matching[:10_000], None, similar, edit_similarity , False, "string_matching_sig", "results/string_matching/" )) # Schema Matching Experiment experiments.append(( run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75], labels_sig_schemes, source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False, "schema_matching_sig", "results/schema_matching/" )) # Inclusion Dependency Experiment experiments.append(( run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75], labels_sig_schemes, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True, "inclusion_dependency_sig", "results/inclusion_dependency/" )) if experiment_config["reduction_runs"]: # Reduction Runs experiments.append(( run_reduction_experiment, [0.7, 0.75, 0.8, 0.85], 0.0, labels_reduction, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True, "inclusion_dependency_reduction", "results/inclusion_dependency/" )) if experiment_config["scalability_runs"]: # Scalability Runs # String Matching experiments.append(( run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.7, [1_000, 10_000, 100_000], source_string_matching[:100_000], None, similar, edit_similarity, False, "string_matching_scalability", "results/string_matching/" )) # Inclusion Dependency experiments.append(( run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.5, [100_000, 200_000, 300_000, 400_000, 500_000], source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True, "inclusion_dependency_scalability", "results/inclusion_dependency/" )) # Schema Matching experiments.append(( run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.0, [12_000, 24_000, 36_000, 48_000, 60_000], source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False, "schema_matching_scalability", "results/schema_matching/" )) if experiment_config["schema_github_webtable_runs"]: # Schema Matching with GitHub Webtable Schemas experiments.append(( run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75], labels_filter, source_sets_schema_matching[:10_000], github_source_sets_schema_matching[:10_000], similar, jaccard_similarity, True, "github_webtable_schema_matching", "results/schema_matching/" )) if experiment_config["inc_dep_without_silkmoth"]: experiments.append(( run_matching_without_silkmoth_inc_dep, source_sets_in_dep[:500_000], reference_sets_in_dep[:200], [0.7, 0.75, 0.8, 0.85], 0.5, contain, jaccard_similarity, "raw_matching", "results/inclusion_dependency/" )) # Create and start processes for each experiment processes = [] for experiment in experiments: method, *args = experiment process = multiprocessing.Process(target=run_experiment_multi, args=(method, *args)) processes.append(process) process.start() # Wait for all processes to complete for process in processes: process.join()