init
Update README.md
This commit is contained in:
176
experiments/run.py
Normal file
176
experiments/run.py
Normal file
@@ -0,0 +1,176 @@
|
||||
# Python
|
||||
import multiprocessing
|
||||
from experiments import run_experiment_filter_schemes, run_reduction_experiment, run_scalability_experiment, run_matching_without_silkmoth_inc_dep
|
||||
import os
|
||||
from data_loader import DataLoader
|
||||
from utils import load_sets_from_files
|
||||
from src.silkmoth.utils import jaccard_similarity, contain, similar, SigType, edit_similarity
|
||||
|
||||
|
||||
def run_experiment_multi(experiment_method, *args):
    """Invoke *experiment_method* with the given positional arguments.

    Thin adapter used as the ``multiprocessing.Process`` target so that
    every experiment, whatever its signature, runs through one uniform
    entry point. Any return value of *experiment_method* is discarded.
    """
    experiment_method(*args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: load the datasets, build the list of experiments selected
    # in `experiment_config`, then run each experiment in its own process.
    data_loader = DataLoader("/")

    # Labels for Filter Experiments
    labels_filter = ["NO FILTER", "CHECK FILTER", "NN FILTER"]

    # Labels for Signature Scheme experiments
    labels_sig_schemes = [SigType.WEIGHTED, SigType.SKYLINE, SigType.DICHOTOMY]

    # Labels for Reduction experiments
    labels_reduction = ["REDUCTION", "NO REDUCTION"]

    # Load the DBLP titles used by the string-matching experiments and
    # tokenize each title into a list of words.
    data_path = os.path.join(os.path.dirname(__file__), "data", "dblp", "DBLP_100k.csv")
    source_string_matching = data_loader.load_dblp_titles(data_path)
    source_string_matching = [title.split() for title in source_string_matching]

    try:
        folder_path = os.path.join(os.path.dirname(__file__), "../experiments/data/webtables")
        folder_path = os.path.normpath(folder_path)

        # Inclusion-dependency datasets (separate reference/source files).
        reference_sets_in_dep, source_sets_in_dep = load_sets_from_files(
            folder_path=folder_path,
            reference_file="reference_sets_inclusion_dependency.json",
            source_file="source_sets_inclusion_dependency.json"
        )

        # Schema-matching dataset: reference and source come from the same
        # file and only the source half is needed, so discard the reference
        # half directly (same `_` convention as the GitHub load below,
        # instead of binding it to a name and `del`-ing it afterwards).
        _, source_sets_schema_matching = load_sets_from_files(
            folder_path=folder_path,
            reference_file="webtable_schemas_sets_500k.json",
            source_file="webtable_schemas_sets_500k.json"
        )

        # GitHub webtable schemas; again only the source side is used.
        _, github_source_sets_schema_matching = load_sets_from_files(
            folder_path=folder_path,
            reference_file="github_webtable_schemas_sets_500k.json",
            source_file="github_webtable_schemas_sets_500k.json"
        )

    except FileNotFoundError:
        # Any missing dataset file: fall back to empty datasets so the
        # script still runs — the affected experiments simply see no data.
        print("Datasets not found. Skipping Experiments.")
        reference_sets_in_dep, source_sets_in_dep = [], []
        source_sets_schema_matching = []
        github_source_sets_schema_matching = []

    # Experiment configuration: toggle each experiment family on or off.
    experiment_config = {
        "filter_runs": False,
        "signature_scheme_runs": False,
        "reduction_runs": False,
        "scalability_runs": False,
        "schema_github_webtable_runs": False,
        "inc_dep_without_silkmoth": True
    }

    # Each entry is (experiment_method, *args) — unpacked below and handed
    # to run_experiment_multi inside a worker process.
    experiments = []

    if experiment_config["filter_runs"]:
        # Filter runs
        # String Matching Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.7, 0.75, 0.8, 0.85],
            labels_filter, source_string_matching[:10_000], None, similar, edit_similarity, False,
            "string_matching_filter", "results/string_matching/"
        ))

        # Schema Matching Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_filter, source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
            "schema_matching_filter", "results/schema_matching/"
        ))

        # Inclusion Dependency Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_filter, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
            "inclusion_dependency_filter", "results/inclusion_dependency/"
        ))

    if experiment_config["signature_scheme_runs"]:
        # Signature Scheme Runs
        # String Matching Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.7, 0.75, 0.8, 0.85],
            labels_sig_schemes, source_string_matching[:10_000], None, similar, edit_similarity, False,
            "string_matching_sig", "results/string_matching/"
        ))

        # Schema Matching Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_sig_schemes, source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
            "schema_matching_sig", "results/schema_matching/"
        ))

        # Inclusion Dependency Experiment
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_sig_schemes, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
            "inclusion_dependency_sig", "results/inclusion_dependency/"
        ))

    if experiment_config["reduction_runs"]:
        # Reduction Runs
        experiments.append((
            run_reduction_experiment, [0.7, 0.75, 0.8, 0.85], 0.0,
            labels_reduction, source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
            "inclusion_dependency_reduction", "results/inclusion_dependency/"
        ))

    if experiment_config["scalability_runs"]:
        # Scalability Runs
        # String Matching
        experiments.append((
            run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.7, [1_000, 10_000, 100_000],
            source_string_matching[:100_000], None, similar, edit_similarity, False,
            "string_matching_scalability", "results/string_matching/"
        ))

        # Inclusion Dependency
        experiments.append((
            run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.5, [100_000, 200_000, 300_000, 400_000, 500_000],
            source_sets_in_dep, reference_sets_in_dep[:200], contain, jaccard_similarity, True,
            "inclusion_dependency_scalability", "results/inclusion_dependency/"
        ))

        # Schema Matching
        experiments.append((
            run_scalability_experiment, [0.7, 0.75, 0.8, 0.85], 0.0, [12_000, 24_000, 36_000, 48_000, 60_000],
            source_sets_schema_matching[:60_000], None, similar, jaccard_similarity, False,
            "schema_matching_scalability", "results/schema_matching/"
        ))

    if experiment_config["schema_github_webtable_runs"]:
        # Schema Matching with GitHub Webtable Schemas
        experiments.append((
            run_experiment_filter_schemes, [0.7, 0.75, 0.8, 0.85], [0.0, 0.25, 0.5, 0.75],
            labels_filter, source_sets_schema_matching[:10_000], github_source_sets_schema_matching[:10_000], similar, jaccard_similarity, True,
            "github_webtable_schema_matching", "results/schema_matching/"
        ))

    if experiment_config["inc_dep_without_silkmoth"]:
        # Inclusion-dependency matching without SilkMoth (raw baseline).
        experiments.append((
            run_matching_without_silkmoth_inc_dep, source_sets_in_dep[:500_000], reference_sets_in_dep[:200], [0.7, 0.75, 0.8, 0.85], 0.5, contain, jaccard_similarity,
            "raw_matching", "results/inclusion_dependency/"
        ))

    # Create and start one process per experiment so they run in parallel.
    processes = []
    for experiment in experiments:
        method, *args = experiment
        process = multiprocessing.Process(target=run_experiment_multi, args=(method, *args))
        processes.append(process)
        process.start()

    # Wait for all processes to complete
    for process in processes:
        process.join()
|
||||
Reference in New Issue
Block a user