# SilkMoth/experiments/experiments.py
import time
from math import floor

from silkmoth.silkmoth_engine import SilkMothEngine
from silkmoth.utils import SigType, edit_similarity, contain, jaccard_similarity
from silkmoth.verifier import Verifier
from silkmoth.tokenizer import Tokenizer

from utils import *
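
# The star import from the local utils module is expected to provide
# measure_ram_usage(), save_experiment_results_to_csv(), and plot_elapsed_times(),
# which all experiment runners below rely on.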


def run_experiment_filter_schemes(related_thresholds, similarity_thresholds, labels, source_sets, reference_sets,
                                  sim_metric, sim_func, is_search, file_name_prefix, folder_path):
    """
    Run SilkMoth across combinations of similarity and relatedness thresholds,
    toggling filters or signature schemes according to `labels`, and record
    runtimes, RAM usage, and candidate statistics.

    Parameters
    ----------
    related_thresholds : list[float]
        Thresholds for determining relatedness between sets.
    similarity_thresholds : list[float]
        Thresholds for measuring similarity between set elements.
    labels : list
        Labels selecting the setting for each run: a filter label
        (e.g., "NO FILTER", "CHECK FILTER", "NN FILTER") or a SigType value
        (e.g., SigType.WEIGHTED).
    source_sets : list
        The sets to be compared against the reference sets or against each other.
    reference_sets : list
        The sets used as the reference for comparison.
    sim_metric : callable
        The set-level relatedness metric (e.g., contain).
    sim_func : callable
        The function used to calculate element-level similarity scores.
    is_search : bool
        If True, search the source sets for each reference set; otherwise run
        discovery over the source sets.
    file_name_prefix : str
        Prefix for naming the output files generated during the experiment.
    folder_path : str
        Path to the folder where results will be saved.
    """
# Calculate index time and RAM usage for the SilkMothEngine
in_index_time_start = time.time()
initial_ram = measure_ram_usage()
    # Initialize the SilkMothEngine; construction builds the inverted index.
silk_moth_engine = SilkMothEngine(
related_thresh=0,
source_sets=source_sets,
sim_metric=sim_metric,
sim_func=sim_func,
sim_thresh=0,
is_check_filter=False,
is_nn_filter=False,
)
in_index_time_end = time.time()
final_ram = measure_ram_usage()
in_index_elapsed_time = in_index_time_end - in_index_time_start
in_index_ram_usage = final_ram - initial_ram
print(f"Inverted Index created in {in_index_elapsed_time:.2f} seconds.")
for sim_thresh in similarity_thresholds:
# Check if the similarity function is edit similarity
if sim_func == edit_similarity:
            # Calculate the maximum usable q-gram size from sim_thresh.
            upper_bound_q = sim_thresh / (1 - sim_thresh)
q = floor(upper_bound_q)
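            # Worked example: sim_thresh = 0.8 gives 0.8 / 0.2 = 4.0, so q = 4;
            # sim_thresh = 0.7 gives 0.7 / 0.3 ≈ 2.33, so q = 2.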
print(f"Using q = {q} for edit similarity with sim_thresh = {sim_thresh}")
print(f"Rebuilding Inverted Index with q = {q}...")
silk_moth_engine.set_q(q)
elapsed_times_final = []
silk_moth_engine.set_alpha(sim_thresh)
for label in labels:
elapsed_times = []
for idx, related_thresh in enumerate(related_thresholds):
print(
f"\nRunning SilkMoth {file_name_prefix} with α = {sim_thresh}, θ = {related_thresh}, label = {label}")
                # Configure the candidate filters according to the run label.
if label == "CHECK FILTER":
silk_moth_engine.is_check_filter = True
silk_moth_engine.is_nn_filter = False
elif label == "NN FILTER":
silk_moth_engine.is_check_filter = False
silk_moth_engine.is_nn_filter = True
else: # NO FILTER
silk_moth_engine.is_check_filter = False
silk_moth_engine.is_nn_filter = False
                # Select the signature scheme when the label is a SigType value.
if label == SigType.WEIGHTED:
silk_moth_engine.set_signature_type(SigType.WEIGHTED)
elif label == SigType.SKYLINE:
silk_moth_engine.set_signature_type(SigType.SKYLINE)
elif label == SigType.DICHOTOMY:
silk_moth_engine.set_signature_type(SigType.DICHOTOMY)
silk_moth_engine.set_related_threshold(related_thresh)
# Measure the time taken to search for related sets
time_start = time.time()
                # For search runs, count candidates generated and candidates
                # remaining after filtering, plus the related sets found.
                candidates_amount = 0
                candidates_after = 0
                related_sets_found = 0
if is_search:
for ref_id, ref_set in enumerate(reference_sets):
related_sets_temp, candidates_amount_temp, candidates_removed_temp = silk_moth_engine.search_sets(
ref_set)
candidates_amount += candidates_amount_temp
candidates_after += candidates_removed_temp
related_sets_found += len(related_sets_temp)
else:
# If not searching, we are discovering sets
silk_moth_engine.discover_sets(source_sets)
time_end = time.time()
elapsed_time = time_end - time_start
elapsed_times.append(elapsed_time)
# Create a new data dictionary for each iteration
if is_search:
data_overall = {
"similarity_threshold": sim_thresh,
"related_threshold": related_thresh,
"reference_set_amount": len(reference_sets),
"source_set_amount": len(source_sets),
"label": label,
"elapsed_time": round(elapsed_time, 3),
"inverted_index_time": round(in_index_elapsed_time, 3),
"inverted_index_ram_usage": round(in_index_ram_usage, 3),
"candidates_amount": candidates_amount,
"candidates_amount_after_filtering": candidates_after,
"related_sets_found": related_sets_found,
}
else:
data_overall = {
"similarity_threshold": sim_thresh,
"related_threshold": related_thresh,
"source_set_amount": len(source_sets),
"label": label,
"elapsed_time": round(elapsed_time, 3),
"inverted_index_time": round(in_index_elapsed_time, 3),
"inverted_index_ram_usage": round(in_index_ram_usage, 3),
}
# Save results to a CSV file
save_experiment_results_to_csv(
results=data_overall,
file_name=f"{folder_path}{file_name_prefix}_experiment_results.csv"
)
elapsed_times_final.append(elapsed_times)
_ = plot_elapsed_times(
related_thresholds=related_thresholds,
elapsed_times_list=elapsed_times_final,
fig_text=f"{file_name_prefix} (α = {sim_thresh})",
legend_labels=labels,
file_name=f"{folder_path}{file_name_prefix}_experiment_α={sim_thresh}.png"
)


def run_reduction_experiment(related_thresholds, similarity_threshold, labels, source_sets, reference_sets,
                             sim_metric, sim_func, is_search, file_name_prefix, folder_path):
    """
    Run SilkMoth with the dichotomy signature scheme, comparing runs with and
    without reduction across a range of relatedness thresholds.

    Parameters
    ----------
    related_thresholds : list[float]
        Thresholds for determining relatedness between sets.
    similarity_threshold : float
        Threshold for measuring similarity between set elements.
    labels : list[str]
        Labels selecting the setting for each run ("REDUCTION" or "NO REDUCTION").
    source_sets : list
        The sets to be compared against the reference sets or against each other.
    reference_sets : list
        The sets used as the reference for comparison.
    sim_metric : callable
        The set-level relatedness metric (e.g., contain).
    sim_func : callable
        The function used to calculate element-level similarity scores.
    is_search : bool
        If True, search the source sets for each reference set; otherwise run
        discovery over the source sets.
    file_name_prefix : str
        Prefix for naming the output files generated during the experiment.
    folder_path : str
        Path to the folder where results will be saved.
    """
in_index_time_start = time.time()
initial_ram = measure_ram_usage()
    # Initialize the SilkMothEngine; construction builds the inverted index.
silk_moth_engine = SilkMothEngine(
related_thresh=0,
source_sets=source_sets,
sim_metric=sim_metric,
sim_func=sim_func,
sim_thresh=similarity_threshold,
is_check_filter=False,
is_nn_filter=False,
)
# use dichotomy signature scheme for this experiment
silk_moth_engine.set_signature_type(SigType.DICHOTOMY)
in_index_time_end = time.time()
final_ram = measure_ram_usage()
in_index_elapsed_time = in_index_time_end - in_index_time_start
in_index_ram_usage = final_ram - initial_ram
print(f"Inverted Index created in {in_index_elapsed_time:.2f} seconds.")
elapsed_times_final = []
for label in labels:
if label == "REDUCTION":
silk_moth_engine.set_reduction(True)
elif label == "NO REDUCTION":
silk_moth_engine.set_reduction(False)
elapsed_times = []
for idx, related_thresh in enumerate(related_thresholds):
print(
f"\nRunning SilkMoth {file_name_prefix} with α = {similarity_threshold}, θ = {related_thresh}, label = {label}")
silk_moth_engine.set_related_threshold(related_thresh)
# Measure the time taken to search for related sets
time_start = time.time()
            # For search runs, count candidates generated and candidates
            # remaining after filtering.
            candidates_amount = 0
            candidates_after = 0
if is_search:
for ref_id, ref_set in enumerate(reference_sets):
related_sets_temp, candidates_amount_temp, candidates_removed_temp = silk_moth_engine.search_sets(
ref_set)
candidates_amount += candidates_amount_temp
candidates_after += candidates_removed_temp
else:
# If not searching, we are discovering sets
silk_moth_engine.discover_sets(source_sets)
time_end = time.time()
elapsed_time = time_end - time_start
elapsed_times.append(elapsed_time)
# Create a new data dictionary for each iteration
if is_search:
data_overall = {
"similarity_threshold": similarity_threshold,
"related_threshold": related_thresh,
"reference_set_amount": len(reference_sets),
"source_set_amount": len(source_sets),
"label": label,
"elapsed_time": round(elapsed_time, 3),
"inverted_index_time": round(in_index_elapsed_time, 3),
"inverted_index_ram_usage": round(in_index_ram_usage, 3),
"candidates_amount": candidates_amount,
"candidates_amount_after_filtering": candidates_after,
}
else:
data_overall = {
"similarity_threshold": similarity_threshold,
"related_threshold": related_thresh,
"source_set_amount": len(source_sets),
"label": label,
"elapsed_time": round(elapsed_time, 3),
"inverted_index_time": round(in_index_elapsed_time, 3),
"inverted_index_ram_usage": round(in_index_ram_usage, 3),
}
# Save results to a CSV file
save_experiment_results_to_csv(
results=data_overall,
file_name=f"{folder_path}{file_name_prefix}_experiment_results.csv"
)
elapsed_times_final.append(elapsed_times)
_ = plot_elapsed_times(
related_thresholds=related_thresholds,
elapsed_times_list=elapsed_times_final,
fig_text=f"{file_name_prefix} (α = {similarity_threshold})",
legend_labels=labels,
file_name=f"{folder_path}{file_name_prefix}_experiment_α={similarity_threshold}.png"
)


def run_scalability_experiment(related_thresholds, similarity_threshold, set_sizes, source_sets, reference_sets,
                               sim_metric, sim_func, is_search, file_name_prefix, folder_path):
    """
    Measure how SilkMoth scales with the number of source sets: for each
    relatedness threshold, the engine (and its inverted index) is rebuilt on a
    prefix of source_sets of each given size, with both filters enabled.

    Parameters
    ----------
    related_thresholds : list[float]
        Thresholds for determining relatedness between sets.
    similarity_threshold : float
        Threshold for measuring similarity between set elements.
    set_sizes : list[int]
        Numbers of source sets to index in each run (prefix lengths of
        source_sets).
    source_sets : list
        The sets to be compared against the reference sets or against each other.
    reference_sets : list
        The sets used as the reference for comparison.
    sim_metric : callable
        The set-level relatedness metric (e.g., contain).
    sim_func : callable
        The function used to calculate element-level similarity scores.
    is_search : bool
        If True, search the source sets for each reference set; otherwise run
        discovery over the source sets.
    file_name_prefix : str
        Prefix for naming the output files generated during the experiment.
    folder_path : str
        Path to the folder where results will be saved.
    """
elapsed_times_final = []
for idx, related_thresh in enumerate(related_thresholds):
elapsed_times = []
for size in set_sizes:
in_index_time_start = time.time()
initial_ram = measure_ram_usage()
            # Initialize the SilkMothEngine; construction builds the inverted index.
silk_moth_engine = SilkMothEngine(
related_thresh=0,
source_sets=source_sets[:size],
sim_metric=sim_metric,
sim_func=sim_func,
sim_thresh=similarity_threshold,
is_check_filter=True,
is_nn_filter=True,
)
in_index_time_end = time.time()
final_ram = measure_ram_usage()
in_index_elapsed_time = in_index_time_end - in_index_time_start
in_index_ram_usage = final_ram - initial_ram
print(f"Inverted Index created in {in_index_elapsed_time:.2f} seconds.")
print(
f"\nRunning SilkMoth {file_name_prefix} with α = {similarity_threshold}, θ = {related_thresh}, set_size = {size}")
silk_moth_engine.set_related_threshold(related_thresh)
            if sim_func == edit_similarity:
                # Calculate the maximum usable q-gram size from the similarity threshold.
                upper_bound_q = similarity_threshold / (1 - similarity_threshold)
                q = floor(upper_bound_q)
                print(f"Using q = {q} for edit similarity with sim_thresh = {similarity_threshold}")
                print(f"Rebuilding Inverted Index with q = {q}...")
                silk_moth_engine.set_q(q)
            # Measure the time taken to search for related sets; the q-gram
            # rebuild above is kept outside the timed region, matching
            # run_experiment_filter_schemes.
            time_start = time.time()
            # For search runs, count candidates generated and candidates
            # remaining after filtering.
            candidates_amount = 0
            candidates_after = 0
if is_search:
for ref_id, ref_set in enumerate(reference_sets):
related_sets_temp, candidates_amount_temp, candidates_removed_temp = silk_moth_engine.search_sets(
ref_set)
candidates_amount += candidates_amount_temp
candidates_after += candidates_removed_temp
else:
# If not searching, we are discovering sets
silk_moth_engine.discover_sets(source_sets[:size])
time_end = time.time()
elapsed_time = time_end - time_start
elapsed_times.append(elapsed_time)
# Create a new data dictionary for each iteration
if is_search:
data_overall = {
"similarity_threshold": similarity_threshold,
"related_threshold": related_thresh,
"reference_set_amount": len(reference_sets),
"source_set_amount": len(source_sets[:size]),
"set_size": size,
"elapsed_time": round(elapsed_time, 3),
"inverted_index_time": round(in_index_elapsed_time, 3),
"inverted_index_ram_usage": round(in_index_ram_usage, 3),
"candidates_amount": candidates_amount,
"candidates_amount_after_filtering": candidates_after,
}
else:
data_overall = {
"similarity_threshold": similarity_threshold,
"related_threshold": related_thresh,
"source_set_amount": len(source_sets[:size]),
"set_size": size,
"elapsed_time": round(elapsed_time, 3),
"inverted_index_time": round(in_index_elapsed_time, 3),
"inverted_index_ram_usage": round(in_index_ram_usage, 3),
}
# Save results to a CSV file
save_experiment_results_to_csv(
results=data_overall,
file_name=f"{folder_path}{file_name_prefix}_experiment_results.csv"
)
del silk_moth_engine
elapsed_times_final.append(elapsed_times)
    # Legend entries are the relatedness thresholds; the x-axis shows the
    # number of sets in units of 100,000.
    adjusted_legend_labels = [f"θ = {rt}" for rt in related_thresholds]
    adjusted_set_sizes = [size / 100_000 for size in set_sizes]
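    # e.g. set_sizes = [100_000, 250_000, 500_000] appear at x = 1.0, 2.5, 5.0.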
_ = plot_elapsed_times(
related_thresholds=adjusted_set_sizes,
elapsed_times_list=elapsed_times_final,
fig_text=f"{file_name_prefix} (α = {similarity_threshold})",
legend_labels=adjusted_legend_labels,
file_name=f"{folder_path}{file_name_prefix}_experiment_α={similarity_threshold}.png",
xlabel="Number of Sets (in 100ks)",
)


def run_matching_without_silkmoth_inc_dep(source_sets, reference_sets, related_thresholds, similarity_threshold,
                                          sim_metric, sim_func, file_name_prefix, folder_path):
    """
    Brute-force baseline without the SilkMoth index or filters: verify every
    (reference, source) pair directly with the Verifier and record the elapsed
    time and number of matches for each relatedness threshold.
    """
    tokenizer = Tokenizer(sim_func=sim_func)
    for related_thresh in related_thresholds:
        verifier = Verifier(sim_thresh=similarity_threshold, related_thresh=related_thresh,
                            sim_metric=sim_metric, sim_func=sim_func, reduction=False)
related_sets = []
time_start = time.time()
for ref in reference_sets:
for source in source_sets:
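                # Skip pairs where the reference is larger than the source;
                # presumably such pairs cannot reach the relatedness threshold
                # under the metric used here.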
if len(ref) > len(source):
continue
relatedness = verifier.get_relatedness(tokenizer.tokenize(ref), tokenizer.tokenize(source))
if relatedness >= related_thresh:
related_sets.append((source, relatedness))
time_end = time.time()
elapsed_time = time_end - time_start
data_overall = {
"similarity_threshold": similarity_threshold,
"related_threshold": related_thresh,
"source_set_amount": len(source_sets),
"reference_set_amount": len(reference_sets),
"label": "RAW MATCH",
"elapsed_time": round(elapsed_time, 3),
"matches_found": len(related_sets)
}
# Save results to a CSV file
save_experiment_results_to_csv(
results=data_overall,
file_name=f"{folder_path}{file_name_prefix}_experiment_results.csv"
)
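

# A minimal, hypothetical usage sketch of the runners above. The toy data, the
# folder path, and the choice of contain as sim_metric with jaccard_similarity
# as sim_func are illustrative assumptions, not settings from the original
# experiments; the output folder is assumed to exist.
if __name__ == "__main__":
    toy_sources = [
        ["alpha beta", "gamma delta"],
        ["alpha beta", "epsilon zeta"],
        ["eta theta", "iota kappa"],
    ]
    toy_references = [["alpha beta", "gamma delta"]]

    run_experiment_filter_schemes(
        related_thresholds=[0.6, 0.8],
        similarity_thresholds=[0.7],
        labels=["NO FILTER", "CHECK FILTER", "NN FILTER"],
        source_sets=toy_sources,
        reference_sets=toy_references,
        sim_metric=contain,
        sim_func=jaccard_similarity,
        is_search=True,
        file_name_prefix="toy_filters",
        folder_path="results/",
    )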