SilkMoth/experiments/run.py

"""Entry point that launches the SilkMoth experiment suite, one experiment per process."""
import multiprocessing
import os

from experiments import (
    run_experiment_filter_schemes,
    run_reduction_experiment,
    run_scalability_experiment,
    run_matching_without_silkmoth_inc_dep,
)
from data_loader import DataLoader
from utils import load_sets_from_files
from src.silkmoth.utils import jaccard_similarity, contain, similar, SigType, edit_similarity


def run_experiment_multi(experiment_method, *args):
    """Invoke an experiment function with its arguments inside a worker process."""
    experiment_method(*args)
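
# Note: multiprocessing pickles the process target under the "spawn" start
# method (the default on Windows and macOS), so this wrapper must stay a plain
# module-level function rather than a lambda or closure.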


if __name__ == "__main__":
    data_loader = DataLoader("/")

    # Labels for the filter experiments
    labels_filter = ["NO FILTER", "CHECK FILTER", "NN FILTER"]
    # Labels for the signature schemes
    labels_sig_schemes = [SigType.WEIGHTED, SigType.SKYLINE, SigType.DICHOTOMY]
    # Labels for the reduction experiments
    labels_reduction = ["REDUCTION", "NO REDUCTION"]

    # Load the datasets for the experiments
    data_path = os.path.join(os.path.dirname(__file__), "data", "dblp", "DBLP_100k.csv")
    source_string_matching = data_loader.load_dblp_titles(data_path)
    # Tokenize each title into a list of words
    source_string_matching = [title.split() for title in source_string_matching]
    try:
        folder_path = os.path.join(os.path.dirname(__file__), "../experiments/data/webtables")
        folder_path = os.path.normpath(folder_path)
        reference_sets_in_dep, source_sets_in_dep = load_sets_from_files(
            folder_path=folder_path,
            reference_file="reference_sets_inclusion_dependency.json",
            source_file="source_sets_inclusion_dependency.json"
        )
        # Schema matching is a self-join, so the same file serves as both
        # reference and source; the reference copy is dropped to free memory.
        reference_sets_schema_matching, source_sets_schema_matching = load_sets_from_files(
            folder_path=folder_path,
            reference_file="webtable_schemas_sets_500k.json",
            source_file="webtable_schemas_sets_500k.json"
        )
        del reference_sets_schema_matching
        _, github_source_sets_schema_matching = load_sets_from_files(
            folder_path=folder_path,
            reference_file="github_webtable_schemas_sets_500k.json",
            source_file="github_webtable_schemas_sets_500k.json"
        )
    except FileNotFoundError:
        print("Datasets not found. Skipping the experiments that depend on them.")
        reference_sets_in_dep, source_sets_in_dep = [], []
        source_sets_schema_matching = []
        github_source_sets_schema_matching = []
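
    # With these empty fallbacks, the slices below (e.g. reference_sets_in_dep[:200])
    # simply yield empty inputs, so the experiment list is still constructed
    # without errors even when the datasets are missing.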

    # Experiment configuration: set a flag to True to enable that group of runs
    experiment_config = {
        "filter_runs": False,
        "signature_scheme_runs": False,
        "reduction_runs": False,
        "scalability_runs": False,
        "schema_github_webtable_runs": False,
        "inc_dep_without_silkmoth": True,
    }
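
    # Each entry below is (experiment_function, *positional_args). Judging from
    # the call sites, run_experiment_filter_schemes takes roughly
    # (thresholds, related_thresholds, labels, source_sets, reference_sets,
    #  related_func, sim_func, is_search_flag, experiment_name, output_dir);
    # these parameter names are inferred from usage here, not from its definition.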

    # Define the experiments to run
    experiments = []
    if experiment_config["filter_runs"]:
        # Filter runs
        # String Matching Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.7, 0.75, 0.8, 0.85],
            labels_filter, source_string_matching[:10_000], None, similar, edit_similarity, False,
            "string_matching_filter", "results/string_matching/"
        ))
        # Schema Matching Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_filter, source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
            "schema_matching_filter", "results/schema_matching/"
        ))
        # Inclusion Dependency Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_filter, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
            "inclusion_dependency_filter", "results/inclusion_dependency/"
        ))
if experiment_config["signature_scheme_runs"]:
# Signature Scheme Runs
#String Matching Experiment
experiments.append((
run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.7, 0.75, 0.8, 0.85],
labels_sig_schemes, source_string_matching[:10_000], None, similar, edit_similarity , False,
"string_matching_sig", "results/string_matching/"
))
# Schema Matching Experiment
experiments.append((
run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
labels_sig_schemes, source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
"schema_matching_sig", "results/schema_matching/"
))
# Inclusion Dependency Experiment
experiments.append((
run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
labels_sig_schemes, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
"inclusion_dependency_sig", "results/inclusion_dependency/"
))
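
    # The signature-scheme runs reuse the same threshold grids and dataset
    # slices as the filter runs above; only the labels (and thus the signature
    # scheme under test) differ.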
if experiment_config["reduction_runs"]:
# Reduction Runs
experiments.append((
run_reduction_experiment, [0.7, 0.75, 0.8, 0.85], 0.0,
labels_reduction, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
"inclusion_dependency_reduction", "results/inclusion_dependency/"
))
if experiment_config["scalability_runs"]:
# Scalability Runs
# String Matching
experiments.append((
run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.7, [1_000, 10_000, 100_000],
source_string_matching[:100_000], None, similar, edit_similarity, False,
"string_matching_scalability", "results/string_matching/"
))
# Inclusion Dependency
experiments.append((
run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.5, [100_000, 200_000, 300_000, 400_000, 500_000],
source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
"inclusion_dependency_scalability", "results/inclusion_dependency/"
))
# Schema Matching
experiments.append((
run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.0, [12_000, 24_000, 36_000, 48_000, 60_000],
source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
"schema_matching_scalability", "results/schema_matching/"
))
if experiment_config["schema_github_webtable_runs"]:
# Schema Matching with GitHub Webtable Schemas
experiments.append((
run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
labels_filter, source_sets_schema_matching[:10_000], github_source_sets_schema_matching[:10_000], similar, jaccard_similarity, True,
"github_webtable_schema_matching", "results/schema_matching/"
))
if experiment_config["inc_dep_without_silkmoth"]:
experiments.append((
run_matching_without_silkmoth_inc_dep, source_sets_in_dep[:500_000], reference_sets_in_dep[:200], [0.7, 0.75, 0.8, 0.85], 0.5, contain, jaccard_similarity,
"raw_matching", "results/inclusion_dependency/"
))
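
    # Note: each experiment runs in its own process; under the "spawn" start
    # method the argument tuples (including the dataset slices) are pickled and
    # copied into every child, so enabling many runs at once multiplies memory use.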

    # Create and start a process for each experiment
    processes = []
    for experiment in experiments:
        method, *args = experiment
        process = multiprocessing.Process(target=run_experiment_multi, args=(method, *args))
        processes.append(process)
        process.start()

    # Wait for all processes to complete
    for process in processes:
        process.join()
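
# Usage sketch: the mixed absolute imports above ("experiments", "data_loader",
# "src.silkmoth.utils") suggest the script is started from the repository root
# with the relevant directories on sys.path, e.g. something like
#   PYTHONPATH=. python experiments/run.py
# but the exact invocation depends on the project's path setup.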