SilkMoth/experiments/run.py

"""Entry point that launches the SilkMoth experiment suite, one experiment per process."""
import multiprocessing
import os

from experiments import (
    run_experiment_filter_schemes,
    run_reduction_experiment,
    run_scalability_experiment,
    run_matching_without_silkmoth_inc_dep,
)
from data_loader import DataLoader
from utils import load_sets_from_files
from src.silkmoth.utils import jaccard_similarity, contain, similar, SigType, edit_similarity


def run_experiment_multi(experiment_method, *args):
    """Invoke an experiment function with its arguments inside a worker process."""
    experiment_method(*args)
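
# Note: multiprocessing pickles the process target under the "spawn" start
# method (the default on Windows and macOS), so this wrapper must stay a plain
# module-level function rather than a lambda or closure.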


if __name__ == "__main__":
    data_loader = DataLoader("/")

    # Labels for the filter experiments
    labels_filter = ["NO FILTER", "CHECK FILTER", "NN FILTER"]
    # Labels for the signature schemes
    labels_sig_schemes = [SigType.WEIGHTED, SigType.SKYLINE, SigType.DICHOTOMY]
    # Labels for the reduction experiments
    labels_reduction = ["REDUCTION", "NO REDUCTION"]

    # Load the datasets for the experiments
    data_path = os.path.join(os.path.dirname(__file__), "data", "dblp", "DBLP_100k.csv")
    source_string_matching = data_loader.load_dblp_titles(data_path)
    # Tokenize each title into a list of words
    source_string_matching = [title.split() for title in source_string_matching]
    try:
        folder_path = os.path.join(os.path.dirname(__file__), "../experiments/data/webtables")
        folder_path = os.path.normpath(folder_path)
        reference_sets_in_dep, source_sets_in_dep = load_sets_from_files(
            folder_path=folder_path,
            reference_file="reference_sets_inclusion_dependency.json",
            source_file="source_sets_inclusion_dependency.json"
        )
        # Schema matching is a self-join, so the same file serves as both
        # reference and source; the reference copy is dropped to free memory.
        reference_sets_schema_matching, source_sets_schema_matching = load_sets_from_files(
            folder_path=folder_path,
            reference_file="webtable_schemas_sets_500k.json",
            source_file="webtable_schemas_sets_500k.json"
        )
        del reference_sets_schema_matching
        _, github_source_sets_schema_matching = load_sets_from_files(
            folder_path=folder_path,
            reference_file="github_webtable_schemas_sets_500k.json",
            source_file="github_webtable_schemas_sets_500k.json"
        )
    except FileNotFoundError:
        print("Datasets not found. Skipping the experiments that depend on them.")
        reference_sets_in_dep, source_sets_in_dep = [], []
        source_sets_schema_matching = []
        github_source_sets_schema_matching = []
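
    # With these empty fallbacks, the slices below (e.g. reference_sets_in_dep[:200])
    # simply yield empty inputs, so the experiment list is still constructed
    # without errors even when the datasets are missing.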

    # Experiment configuration: set a flag to True to enable that group of runs
    experiment_config = {
        "filter_runs": False,
        "signature_scheme_runs": False,
        "reduction_runs": False,
        "scalability_runs": False,
        "schema_github_webtable_runs": False,
        "inc_dep_without_silkmoth": True,
    }
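
    # Each entry below is (experiment_function, *positional_args). Judging from
    # the call sites, run_experiment_filter_schemes takes roughly
    # (thresholds, related_thresholds, labels, source_sets, reference_sets,
    #  related_func, sim_func, is_search_flag, experiment_name, output_dir);
    # these parameter names are inferred from usage here, not from its definition.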

    # Define the experiments to run
    experiments = []
    if experiment_config["filter_runs"]:
        # Filter runs
        # String Matching Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.7, 0.75, 0.8, 0.85],
            labels_filter, source_string_matching[:10_000], None, similar, edit_similarity, False,
            "string_matching_filter", "results/string_matching/"
        ))
        # Schema Matching Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_filter, source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
            "schema_matching_filter", "results/schema_matching/"
        ))
        # Inclusion Dependency Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_filter, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
            "inclusion_dependency_filter", "results/inclusion_dependency/"
        ))
if experiment_config["signature_scheme_runs"]:
# Signature Scheme Runs
#String Matching Experiment
experiments.append((
run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.7, 0.75, 0.8, 0.85],
labels_sig_schemes, source_string_matching[:10_000], None, similar, edit_similarity , False,
"string_matching_sig", "results/string_matching/"
))
# Schema Matching Experiment
experiments.append((
run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
labels_sig_schemes, source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
"schema_matching_sig", "results/schema_matching/"
))
# Inclusion Dependency Experiment
experiments.append((
run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
labels_sig_schemes, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
"inclusion_dependency_sig", "results/inclusion_dependency/"
))
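
    # The signature-scheme runs reuse the same threshold grids and dataset
    # slices as the filter runs above; only the labels (and thus the signature
    # scheme under test) differ.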
if experiment_config["reduction_runs"]:
# Reduction Runs
experiments.append((
run_reduction_experiment, [0.7, 0.75, 0.8, 0.85], 0.0,
labels_reduction, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
"inclusion_dependency_reduction", "results/inclusion_dependency/"
))
if experiment_config["scalability_runs"]:
# Scalability Runs
# String Matching
experiments.append((
run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.7, [1_000, 10_000, 100_000],
source_string_matching[:100_000], None, similar, edit_similarity, False,
"string_matching_scalability", "results/string_matching/"
))
# Inclusion Dependency
experiments.append((
run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.5, [100_000, 200_000, 300_000, 400_000, 500_000],
source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
"inclusion_dependency_scalability", "results/inclusion_dependency/"
))
# Schema Matching
experiments.append((
run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.0, [12_000, 24_000, 36_000, 48_000, 60_000],
source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
"schema_matching_scalability", "results/schema_matching/"
))
if experiment_config["schema_github_webtable_runs"]:
# Schema Matching with GitHub Webtable Schemas
experiments.append((
run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
labels_filter, source_sets_schema_matching[:10_000], github_source_sets_schema_matching[:10_000], similar, jaccard_similarity, True,
"github_webtable_schema_matching", "results/schema_matching/"
))
if experiment_config["inc_dep_without_silkmoth"]:
experiments.append((
run_matching_without_silkmoth_inc_dep, source_sets_in_dep[:500_000], reference_sets_in_dep[:200], [0.7, 0.75, 0.8, 0.85], 0.5, contain, jaccard_similarity,
"raw_matching", "results/inclusion_dependency/"
))
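
    # Note: each experiment runs in its own process; under the "spawn" start
    # method the argument tuples (including the dataset slices) are pickled and
    # copied into every child, so enabling many runs at once multiplies memory use.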

    # Create and start a process for each experiment
    processes = []
    for experiment in experiments:
        method, *args = experiment
        process = multiprocessing.Process(target=run_experiment_multi, args=(method, *args))
        processes.append(process)
        process.start()

    # Wait for all processes to complete
    for process in processes:
        process.join()
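
# Usage sketch: the mixed absolute imports above ("experiments", "data_loader",
# "src.silkmoth.utils") suggest the script is started from the repository root
# with the relevant directories on sys.path, e.g. something like
#   PYTHONPATH=. python experiments/run.py
# but the exact invocation depends on the project's path setup.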