init
Update README.md
64
frontend/pages/dataset_view.py
Normal file
@@ -0,0 +1,64 @@
import streamlit as st
import pandas as pd
import json
import os

# Directory containing the JSON files
data_folder = "../experiments/data/webtables/"
# JSON files to be used
reference_file = "reference_sets_inclusion_dependency.json"
source_file = "source_sets_inclusion_dependency.json"
schema_matching_file = "webtable_schemas_sets_500k.json"

# Full paths to the selected files
reference_file_path = os.path.join(data_folder, reference_file)
source_file_path = os.path.join(data_folder, source_file)
schema_matching_file_path = os.path.join(data_folder, schema_matching_file)


st.title("Datasets")
st.divider()
st.markdown(
    """
    <div style="background-color: #f9f9f9; padding: 20px; border-radius: 8px; border: 1px solid #ddd; line-height: 1.6;">
        <p style="color: #333; font-size: 16px; margin: 0;">
            This page provides an interactive interface to explore the datasets used in the SilkMoth Engine experiments.
            Please note that only a fraction of the data is displayed (the first 50 rows of each file).
            We perform three types of experiments using two primary data sources:
        </p>
        <ul style="color: #555; font-size: 15px; margin-top: 10px; padding-left: 20px;">
            <li><strong>Schema Matching Experiment:</strong> Uses 500,000 Webtable schemas for both the reference and source sets.</li>
            <li><strong>Inclusion Dependency Experiment:</strong> Involves 500,000 Webtable columns in the source set, with 1,000 of these selected as the reference set.</li>
            <li><strong>String Matching Experiment:</strong> Uses the DBLP dataset for matching tasks.</li>
        </ul>
    </div>
    """,
    unsafe_allow_html=True,
)
st.divider()
st.subheader("Schema Matching Dataset")

# Load and display the first 50 rows of the schema matching dataset
try:
    with open(schema_matching_file_path, 'r', encoding='utf-8') as schema_file:
        schema_data = json.load(schema_file)
    schema_df = pd.DataFrame(schema_data).head(50)
    st.dataframe(schema_df)
except Exception as e:
    st.error(f"Error loading schema matching dataset: {e}")


st.divider()
st.subheader("Inclusion Dependency Datasets")

# Load and display the first 50 rows of the reference sets
st.subheader("Reference/Source Sets")
try:
    with open(reference_file_path, 'r', encoding='utf-8') as ref_file:
        reference_sets = json.load(ref_file)
    ref_df = pd.DataFrame(reference_sets).head(50)
    st.dataframe(ref_df)
except Exception as e:
    st.error(f"Error loading reference sets: {e}")
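Note: the preview above loads only the reference file, even though the subheader reads "Reference/Source Sets". A minimal sketch of how the source sets could be previewed in the same way is shown below; it assumes source_sets_inclusion_dependency.json shares the list-of-sets structure of the reference file and is not part of this commit.

# Hypothetical addition (not in this commit): preview the source sets with the
# same pattern used for the reference sets above.
try:
    with open(source_file_path, 'r', encoding='utf-8') as src_file:
        source_sets = json.load(src_file)
    src_df = pd.DataFrame(source_sets).head(50)  # preview only the first 50 rows
    st.dataframe(src_df)
except Exception as e:
    st.error(f"Error loading source sets: {e}")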
134
frontend/pages/inclusion_dependency_view.py
Normal file
@@ -0,0 +1,134 @@
import random
import time

import streamlit as st

from silkmoth.silkmoth_engine import SilkMothEngine
from silkmoth.utils import jaccard_similarity, contain
import os
import json
from utils import *


# Streamlit app
st.title("SilkMoth Engine Input Interface")
st.divider()
st.subheader("Inclusion Dependency Experiment")


# Input fields for SilkMothEngine parameters
# Allow the user to select the number of thresholds (up to 4)
num_thresholds = st.number_input("Number of Thresholds", min_value=1, max_value=4, value=1, step=1)

# Dynamically create sliders for the selected number of thresholds
thresholds = []
for i in range(num_thresholds):
    threshold = st.slider(f"Threshold {i + 1}", 0.0, 1.0, 0.5, 0.05)
    thresholds.append(threshold)


# sim_thresh = st.slider("Similarity Threshold", 0.0, 1.0, 0.0, 0.05)
check_filter = st.checkbox("Enable Check Filter", value=False)
nn_filter = st.checkbox("Enable Nearest Neighbor Filter", value=False)

# Directory containing the JSON files
data_folder = "../experiments/data/webtables/"

# JSON files to be used
reference_file = "reference_sets_inclusion_dependency.json"
source_file = "source_sets_inclusion_dependency.json"

# Full paths to the selected files
reference_file_path = os.path.join(data_folder, reference_file)
source_file_path = os.path.join(data_folder, source_file)

# Run the SilkMothEngine with progress animation and loading mask
if st.button("Run SilkMoth Engine"):
    if reference_file and source_file:
        try:
            # Create a placeholder for the loading animation
            loading_placeholder = st.empty()
            loading_placeholder.markdown("<div style='text-align: center; font-size: 20px;'>SilkMothEngine is running...</div>", unsafe_allow_html=True)

            # Open and load reference and source sets from the selected files
            with open(reference_file_path, 'r', encoding='utf-8') as ref_file:
                reference_sets = json.load(ref_file)
            with open(source_file_path, 'r', encoding='utf-8') as src_file:
                source_sets = json.load(src_file)

            st.write("Creating inverted index ...")
            in_index_time_start = time.time()
            # Initialize the SilkMothEngine (the inverted index is built here)
            silk_moth_engine = SilkMothEngine(
                related_thresh=0,
                source_sets=source_sets,
                sim_metric=contain,
                sim_func=jaccard_similarity,
                sim_thresh=0,
                is_check_filter=False,
                is_nn_filter=False,
            )
            in_index_time_end = time.time()
            in_index_elapsed_time = in_index_time_end - in_index_time_start
            st.write(f"Inverted index created in {in_index_elapsed_time:.2f} seconds.")

            elapsed_times_final = []
            labels = ["NO FILTER"]
            if check_filter:
                labels.append("CHECK FILTER")
            if nn_filter:
                labels.append("NN FILTER")

            for label in labels:
                elapsed_times = []
                for idx, related_thresh in enumerate(thresholds):
                    if label == "CHECK FILTER":
                        silk_moth_engine.is_check_filter = True
                        silk_moth_engine.is_nn_filter = False
                    elif label == "NN FILTER":
                        silk_moth_engine.is_check_filter = False
                        silk_moth_engine.is_nn_filter = True

                    st.write(f"Processing Threshold {idx + 1}: {related_thresh} with {label} ...")
                    silk_moth_engine.set_related_threshold(related_thresh)
                    # Measure the time taken to search for related sets
                    time_start = time.time()

                    for ref_set in reference_sets:
                        related_sets = silk_moth_engine.search_sets(ref_set)
                        del related_sets
                    time_end = time.time()

                    elapsed_time = time_end - time_start
                    elapsed_times.append(elapsed_time)

                elapsed_times_final.append(elapsed_times)

            # Remove the loading animation
            loading_placeholder.empty()

            # Display results
            st.success("SilkMoth Engine ran successfully!")
            fig = plot_elapsed_times(
                related_thresholds=thresholds,
                elapsed_times_list=elapsed_times_final,
                fig_text="Inclusion Dependency (α = 0.0)",
                legend_labels=labels,
                file_name="webtable_inclusion_dependency_experiment_demo.png"
            )
            st.pyplot(fig)

        except Exception as e:
            st.error(f"An error occurred: {e}")
    else:
        st.warning("Please make sure both the reference and source set files are available.")
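The helper plot_elapsed_times comes in through `from utils import *` and its implementation is not shown in this commit. A hypothetical sketch, inferred only from the call site above, would draw one runtime curve per filter configuration over the chosen relatedness thresholds, save the figure under file_name, and return it for st.pyplot:

# Hypothetical sketch of the plot_elapsed_times helper from the frontend utils
# module; the real implementation may differ.
import matplotlib.pyplot as plt

def plot_elapsed_times(related_thresholds, elapsed_times_list, fig_text,
                       legend_labels, file_name):
    fig, ax = plt.subplots()
    # One curve per filter configuration, measured over all thresholds.
    for times, label in zip(elapsed_times_list, legend_labels):
        ax.plot(related_thresholds, times, marker="o", label=label)
    ax.set_xlabel("Relatedness threshold")
    ax.set_ylabel("Elapsed time (s)")
    ax.set_title(fig_text)
    ax.legend()
    fig.savefig(file_name)  # persist a copy, as the file_name argument suggests
    return fig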
78
frontend/pages/what_is_silkmoth.py
Normal file
@@ -0,0 +1,78 @@
import streamlit as st

st.title("What is SilkMoth?")
st.markdown("""
The **SilkMoth Engine** is a powerful framework designed for **efficiently discovering relationships and similarities among large collections of data sets.**

It operates by:

1. **Treating each data collection as a "set"** composed of unique "elements."
2. **Applying advanced similarity metrics and optimized algorithms** to compare these sets.
3. **Identifying "related" sets** based on a user-defined similarity threshold.

This enables the rapid identification of connections within vast amounts of data, making it useful for tasks like data organization, integration, and uncovering hidden insights.
""")
st.divider()
st.title("🔁 Core Pipeline Steps")

st.image("docs/figures/Pipeline.png", caption="Figure 1: SILKMOTH Framework Overview. Source: Deng et al., 'SILKMOTH: An Efficient Method for Finding Related Sets with Maximum Matching Constraints', VLDB 2017. Licensed under CC BY-NC-ND 4.0.")

st.subheader("1. Tokenization")
st.markdown("""
Each element in every set is tokenized based on the selected similarity function:
- **Jaccard Similarity**: Elements are split into whitespace-delimited tokens.
- **Edit Similarity**: Elements are split into overlapping `q`-grams (e.g., 3-grams).
""")

st.subheader("2. Inverted Index Construction")
st.markdown("""
An **inverted index** is built over the source sets to map each token to a list of `(set, element)` pairs in which it occurs. This allows fast lookup of candidate sets that share tokens with a query.
""")

st.subheader("3. Signature Generation")
st.markdown("""
A **signature** is a subset of tokens selected from each set such that:
- Any related set must share at least one signature token.
- Signature size is minimized to reduce the candidate space.

**Signature selection heuristics** (e.g., cost/value greedy ranking) are used to approximate the optimal valid signature, which is NP-complete to compute exactly.
""")

st.subheader("4. Candidate Selection")
st.markdown("""
For each set `R`, we retrieve from the inverted index all sets `S` that share at least one token with `R`'s signature. These become the **candidate sets** for further evaluation.
""")

st.subheader("5. Refinement Filters")
st.markdown("""
Two filters reduce false positives among the candidates:

- **Check Filter**: Uses an upper bound on similarity to eliminate sets that cannot meet the threshold.
- **Nearest Neighbor Filter**: Approximates the maximum matching score using the nearest neighbor similarity for each element in `R`.
""")

st.subheader("6. Verification via Maximum Matching")
st.markdown("""
For the remaining candidates, we compute the **maximum weighted bipartite matching** between elements of `R` and `S`, using the chosen similarity function as edge weights.

Only sets whose matching score meets or exceeds the relatedness threshold `δ` are considered **related**.
""")

st.markdown("---")

st.subheader("🧪 Modes of Operation")
st.markdown("""
- **Discovery Mode**: Compare all pairs of sets to find all related set pairs.
  **Use Case**: When you want to find all related set pairs in a dataset, for tasks like schema matching or entity deduplication.
- **Search Mode**: Given a reference set, find all sets related to it.
  **Use Case**: When you want to check which sets (e.g., columns in a database) are related to a specific reference set.
""")

st.markdown("---")

st.subheader("📐 Supported Similarity Functions")
st.markdown("""
- **Jaccard Similarity**
- **Edit Similarity** (Levenshtein-based)
- An optional **minimum similarity threshold** `α` can be enforced on element comparisons.
""")
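The element-level similarity functions and the verification step described on this page can be illustrated with small stand-alone sketches. The engine's own implementations live in silkmoth.utils and inside SilkMothEngine, neither of which is part of this commit; the functions below are simplified illustrations only.

# Illustrative stand-ins for the element-level similarity functions described in
# step 1 and under "Supported Similarity Functions"; silkmoth.utils may differ.
def jaccard_similarity(a: str, b: str) -> float:
    # Jaccard over whitespace-delimited tokens.
    ta, tb = set(a.split()), set(b.split())
    if not ta and not tb:
        return 1.0
    return len(ta & tb) / len(ta | tb)


def edit_similarity(a: str, b: str) -> float:
    # Normalized Levenshtein similarity: 1 - distance / max(len(a), len(b)).
    m, n = len(a), len(b)
    if max(m, n) == 0:
        return 1.0
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        cur = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            cur[j] = min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + cost)
        prev = cur
    return 1.0 - prev[n] / max(m, n)

The verification step (step 6) can likewise be sketched with SciPy's assignment solver: build the similarity matrix between the elements of R and S, solve the maximum weighted bipartite matching, and compare the score against the relatedness threshold. The normalization by |R| below follows the containment-style metric used on the experiment page and is an assumption; the engine may normalize differently.

# Sketch of maximum-matching verification; a stand-in for the engine's own code.
import numpy as np
from scipy.optimize import linear_sum_assignment

def is_related(R, S, sim, delta):
    # Edge weights: similarity between every element of R and every element of S.
    weights = np.array([[sim(r, s) for s in S] for r in R])
    # linear_sum_assignment minimizes cost, so negate the weights to maximize.
    rows, cols = linear_sum_assignment(-weights)
    matching_score = weights[rows, cols].sum() / len(R)  # assumed |R| normalization
    return matching_score >= delta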