init
This commit is contained in:
66
frontend/README.md
Normal file
66
frontend/README.md
Normal file
@@ -0,0 +1,66 @@
|
||||
# 🦋 SilkMoth Frontend
|
||||
|
||||
This is the **frontend** for the **SilkMoth** project, built with the [Streamlit](https://streamlit.io/) framework to provide an interactive web interface.
|
||||
|
||||
---
|
||||
|
||||
## ⚙️ Requirements
|
||||
|
||||
- [Python](https://www.python.org/) installed on your system
|
||||
- `venv` module for virtual environments
|
||||
- Project data available in the `experiments/data/` folder
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Setup
|
||||
|
||||
Two setup scripts are provided—one for **Windows** and another for **Linux/macOS**. These scripts will:
|
||||
|
||||
1. Create a virtual environment
|
||||
2. Install all necessary Python dependencies
|
||||
3. Launch the frontend application
|
||||
|
||||
### 🪟 Windows
|
||||
|
||||
Open a terminal in the `frontend` directory and run:
|
||||
|
||||
```bash
|
||||
.\setup_win.bat
|
||||
```
|
||||
---
|
||||
|
||||
### 🐧 Linux / 🍎 macOS
|
||||
|
||||
Open a terminal in the `frontend` directory and run:
|
||||
|
||||
```bash
|
||||
./setup_unix.sh
|
||||
|
||||
```
|
||||
---
|
||||
|
||||
## ▶️ Usage
|
||||
|
||||
Once the setup is complete, follow these steps to run the frontend manually:
|
||||
|
||||
### 1. Activate the virtual environment
|
||||
|
||||
#### 🪟 Windows
|
||||
|
||||
```powershell
|
||||
.\.venv\Scripts\Activate.ps1
|
||||
```
|
||||
|
||||
#### 🐧 Linux / 🍎 macOS
|
||||
|
||||
Activate the virtual environment:
|
||||
|
||||
```bash
|
||||
source .venv/bin/activate
|
||||
|
||||
```
|
||||
### 2. Run the Streamlit app
|
||||
|
||||
```bash
|
||||
streamlit run app.py
|
||||
```
|
||||
13
frontend/app.py
Normal file
13
frontend/app.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import streamlit as st
|
||||
|
||||
# (script path, title) of every page listed under the "SilkMoth" section,
# in display order.
_SILKMOTH_PAGES = (
    ("pages/what_is_silkmoth.py", "What is SilkMoth?"),
    ("pages/inclusion_dependency_view.py", "Inclusion Dependency Experiment"),
    ("pages/dataset_view.py", "Our Datasets"),
)

# Section name -> list of st.Page objects, the mapping handed to st.navigation.
pages = {
    "SilkMoth": [st.Page(script, title=label) for script, label in _SILKMOTH_PAGES],
}

# Build the navigation and run the page object it returns.
pg = st.navigation(pages)
pg.run()
|
||||
BIN
frontend/docs/figures/Pipeline.png
Normal file
BIN
frontend/docs/figures/Pipeline.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 230 KiB |
64
frontend/pages/dataset_view.py
Normal file
64
frontend/pages/dataset_view.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import streamlit as st
|
||||
import pandas as pd
|
||||
import json
|
||||
import os
|
||||
|
||||
# Folder holding the webtables JSON files (relative to the working directory
# the app is started from).
data_folder = "../experiments/data/webtables/"

# Names of the JSON files used by the experiments.
reference_file = "reference_sets_inclusion_dependency.json"
source_file = "source_sets_inclusion_dependency.json"
schema_matching_file = "webtable_schemas_sets_500k.json"

# Paths of the files above inside the data folder.
reference_file_path, source_file_path, schema_matching_file_path = (
    os.path.join(data_folder, file_name)
    for file_name in (reference_file, source_file, schema_matching_file)
)


def _show_preview(json_path, description, max_rows=50):
    """Render the first rows of a JSON data set as a table.

    Args:
        json_path: Path of the JSON file to load.
        description: Name of the data set, used in the error message.
        max_rows: Number of leading rows to display.

    Any exception raised while reading, parsing or tabulating the file is
    shown on the page with ``st.error`` instead of being propagated.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
        st.dataframe(pd.DataFrame(records).head(max_rows))
    except Exception as exc:
        st.error(f"Error loading {description}: {exc}")


# Introductory card describing the experiments and their data sources.
_INTRO_HTML = """
<div style="background-color: #f9f9f9; padding: 20px; border-radius: 8px; border: 1px solid #ddd; line-height: 1.6;">
<p style="color: #333; font-size: 16px; margin: 0;">
This page provides an interactive interface to explore the datasets utilized in the SilkMoth Engine experiments.
Please note that only a fraction of the data is displayed due to constraints.
We perform three types of experiments using two primary data sources:
</p>
<ul style="color: #555; font-size: 15px; margin-top: 10px; padding-left: 20px;">
<li><strong>Schema Matching Experiment:</strong> Utilizes 500,000 Webtable schemas for both the reference and source sets.</li>
<li><strong>Inclusion Dependency Experiment:</strong> Involves 500,000 Webtable columns in the source set, with 1,000 of these selected as the reference set.</li>
<li><strong>String Matching Experiment:</strong> Employs the DPLP dataset for matching tasks.</li>
</ul>
</div>
"""

st.title("Datasets")
st.divider()
st.markdown(_INTRO_HTML, unsafe_allow_html=True)
st.divider()

# Preview of the schema matching data set.
st.subheader("Schema Matching Dataset")
_show_preview(schema_matching_file_path, "schema matching dataset")

st.divider()
st.subheader("Inclusion Dependency Datasets")

# Preview of the reference sets (only the reference file is loaded here).
st.subheader("Reference/Source Sets")
_show_preview(reference_file_path, "reference sets")
|
||||
|
||||
|
||||
134
frontend/pages/inclusion_dependency_view.py
Normal file
134
frontend/pages/inclusion_dependency_view.py
Normal file
@@ -0,0 +1,134 @@
|
||||
import random
|
||||
import time
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from silkmoth.silkmoth_engine import SilkMothEngine
|
||||
from silkmoth.utils import jaccard_similarity, contain
|
||||
import os
|
||||
import json
|
||||
from utils import *
|
||||
|
||||
|
||||
# Page header
st.title("SilkMoth Engine Input Interface")
st.divider()
st.subheader("Inclusion Dependency Experiment")

# --- Engine parameters chosen by the user ---------------------------------
# Between one and four relatedness thresholds can be compared in one run.
num_thresholds = st.number_input(
    "Number of Thresholds", min_value=1, max_value=4, value=1, step=1
)

# One slider per requested threshold, each ranging over [0, 1] in 0.05 steps.
thresholds = [
    st.slider(f"Threshold {position + 1}", 0.0, 1.0, 0.5, 0.05)
    for position in range(num_thresholds)
]

# No widget is offered for the similarity threshold; the engine is created
# with sim_thresh=0 further down in this script.
check_filter = st.checkbox("Enable Check Filter", value=False)
nn_filter = st.checkbox("Enable Nearest Neighbor Filter", value=False)

# --- Input data -----------------------------------------------------------
# Folder holding the webtables JSON files (relative to the working directory
# the app is started from).
data_folder = "../experiments/data/webtables/"

# File names of the reference and source sets.
reference_file = "reference_sets_inclusion_dependency.json"
source_file = "source_sets_inclusion_dependency.json"

# Paths of both files inside the data folder.
reference_file_path, source_file_path = (
    os.path.join(data_folder, file_name)
    for file_name in (reference_file, source_file)
)
|
||||
|
||||
# Run the SilkMothEngine on button press and plot the search time measured for
# every selected threshold under every selected filter setting.
if st.button("Run SilkMoth Engine"):
    # The inputs are fixed files on disk (nothing is uploaded), so verify that
    # they exist; the file-name constants themselves are always truthy.
    missing_files = [
        path
        for path in (reference_file_path, source_file_path)
        if not os.path.isfile(path)
    ]
    if missing_files:
        st.warning("Required data file(s) not found: " + ", ".join(missing_files))
    else:
        # Banner shown while the engine is working; removed in `finally` so it
        # also disappears when the run fails.
        loading_placeholder = st.empty()
        loading_placeholder.markdown(
            "<div style='text-align: center; font-size: 20px;'>SilkMothEngine is running...</div>",
            unsafe_allow_html=True,
        )
        try:
            # Load the reference and source sets
            with open(reference_file_path, 'r', encoding='utf-8') as ref_file:
                reference_sets = json.load(ref_file)
            with open(source_file_path, 'r', encoding='utf-8') as src_file:
                source_sets = json.load(src_file)

            # Time the engine construction; it is reported to the user as the
            # inverted index creation time.
            st.write("Create Inverted Index ...")
            in_index_time_start = time.perf_counter()
            silk_moth_engine = SilkMothEngine(
                related_thresh=0,  # replaced per run via set_related_threshold
                source_sets=source_sets,
                sim_metric=contain,
                sim_func=jaccard_similarity,
                sim_thresh=0,
                is_check_filter=False,
                is_nn_filter=False,
            )
            in_index_elapsed_time = time.perf_counter() - in_index_time_start
            st.write(f"Inverted Index created in {in_index_elapsed_time:.2f} seconds.")

            # Legend label -> (is_check_filter, is_nn_filter). The unfiltered
            # baseline always runs; the others only when ticked by the user.
            filter_settings = {"NO FILTER": (False, False)}
            if check_filter:
                filter_settings["CHECK FILTER"] = (True, False)
            if nn_filter:
                filter_settings["NN FILTER"] = (False, True)
            labels = list(filter_settings)

            elapsed_times_final = []
            for label, (use_check_filter, use_nn_filter) in filter_settings.items():
                # Set both flags explicitly so no state carries over from the
                # previously measured setting.
                silk_moth_engine.is_check_filter = use_check_filter
                silk_moth_engine.is_nn_filter = use_nn_filter

                elapsed_times = []
                for idx, related_thresh in enumerate(thresholds):
                    st.write(f"Processing Threshold {idx + 1}: {related_thresh} with {label} ...")
                    silk_moth_engine.set_related_threshold(related_thresh)

                    # Measure the time taken to search for related sets; the
                    # results themselves are not needed and are discarded.
                    time_start = time.perf_counter()
                    for ref_set in reference_sets:
                        silk_moth_engine.search_sets(ref_set)
                    elapsed_times.append(time.perf_counter() - time_start)

                elapsed_times_final.append(elapsed_times)

            # Display results
            st.success("SilkMoth Engine ran successfully!")
            fig = plot_elapsed_times(
                related_thresholds=thresholds,
                elapsed_times_list=elapsed_times_final,
                fig_text="Inclusion Dependency (α = 0.0)",
                legend_labels=labels,
                file_name="webtable_inclusion_dependency_experiment_demo.png",
            )
            st.pyplot(fig)
        except Exception as e:
            # Page-level boundary: report the failure instead of crashing.
            st.error(f"An error occurred: {e}")
        finally:
            loading_placeholder.empty()
|
||||
78
frontend/pages/what_is_silkmoth.py
Normal file
78
frontend/pages/what_is_silkmoth.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import streamlit as st
|
||||
|
||||
# Overview text shown at the top of the page.
_OVERVIEW_MD = """
The **SilkMoth Engine** is a powerful framework designed for **efficiently discovering relationships and similarities among large collections of data sets.**

It operates by:

1. **Treating each data collection as a "set"** comprised of unique "elements."
2. **Applying advanced similarity metrics and optimized algorithms** to compare these sets.
3. **Identifying "related" sets** based on a user-defined similarity threshold.

This enables the rapid identification of connections within vast amounts of data, making it crucial for tasks like data organization, integration, and uncovering hidden insights.
"""

# Caption and attribution for the pipeline figure.
_PIPELINE_FIGURE_CAPTION = (
    "Figure 1: SILKMOTH Framework Overview. "
    "Source: Deng et al., 'SILKMOTH: An Efficient Method for Finding Related Sets "
    "with Maximum Matching Constraints', VLDB 2017. "
    "Licensed under CC BY-NC-ND 4.0."
)

st.title("What is SilkMoth?")
st.markdown(_OVERVIEW_MD)
st.divider()

st.title("🔁 Core Pipeline Steps")
st.image("docs/figures/Pipeline.png", caption=_PIPELINE_FIGURE_CAPTION)
|
||||
|
||||
# (heading, markdown body) for each pipeline step, rendered in order below.
# Step 2 states that the inverted index is built from the source sets `S`:
# step 4 looks up sets `S` in that index for a reference set `R`, so the index
# cannot be built from `R` as the earlier wording claimed.
_PIPELINE_STEPS = (
    (
        "1. Tokenization",
        """
Each element in every set is tokenized based on the selected similarity function:
- **Jaccard Similarity**: Elements are split into whitespace-delimited tokens.
- **Edit Similarity**: Elements are split into overlapping `q`-grams (e.g., 3-grams).
""",
    ),
    (
        "2. Inverted Index Construction",
        """
An **inverted index** is built from the collection of source sets `S` to map each token to a list of `(set, element)` pairs in which it occurs. This allows fast lookup of candidate sets that share tokens with a query.
""",
    ),
    (
        "3. Signature Generation",
        """
A **signature** is a subset of tokens selected from each set such that:
- Any related set must share at least one signature token.
- Signature size is minimized to reduce candidate space.

**Signature selection heuristics** (e.g., cost/value greedy ranking) are used to approximate the optimal valid signature, which is NP-complete to compute exactly.
""",
    ),
    (
        "4. Candidate Selection",
        """
For each set `R`, we retrieve from the inverted index all sets `S` that share at least one token with `R`’s signature. These become the **candidate sets** for further evaluation.
""",
    ),
    (
        "5. Refinement Filters",
        """
Two filters reduce false positives among the candidates:

- **Check Filter**: Uses an upper bound on similarity to eliminate sets that cannot meet the threshold.
- **Nearest Neighbor Filter**: Approximates the maximum matching score using the nearest neighbor similarity for each element in `R`.
""",
    ),
    (
        "6. Verification via Maximum Matching",
        """
For the remaining candidates, we compute the **maximum weighted bipartite matching** between elements of `R` and `S`, using the chosen similarity function as edge weights.

Only sets whose matching score meets or exceeds a threshold `δ` are considered **related**.
""",
    ),
)

# Render every step as a subheader followed by its description.
for step_heading, step_body in _PIPELINE_STEPS:
    st.subheader(step_heading)
    st.markdown(step_body)
|
||||
|
||||
st.markdown("---")

# Description of the two operating modes. Each use case is paired with the
# mode it describes: discovery compares all pairs, search starts from one
# given reference set.
st.subheader("🧪 Modes of Operation")
st.markdown("""
- **Discovery Mode**: Compare all pairs of sets to find all related set pairs.
**Use Case**: When you want to find all related set pairs in a dataset, for tasks like schema matching or entity deduplication.
- **Search Mode**: Given a reference set, find all sets related to it.
**Use Case**: When you want to check which sets (e.g., columns in a database) are related to a specific reference set.
""")
|
||||
|
||||
# Bullet points of the "Supported Similarity Functions" section.
_SIMILARITY_BULLETS = (
    "- **Jaccard Similarity**",
    "- **Edit Similarity** (Levenshtein-based)",
    "- Optional **minimum similarity threshold** `α` can be enforced on element comparisons.",
)

st.markdown("---")

st.subheader("📐 Supported Similarity Functions")
# Leading and trailing newline keep the rendered markdown text unchanged.
st.markdown("\n" + "\n".join(_SIMILARITY_BULLETS) + "\n")
|
||||
2
frontend/requirements.txt
Normal file
2
frontend/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
streamlit==1.45.1
|
||||
matplotlib==3.10.3
|
||||
55
frontend/setup_unix.sh
Normal file
55
frontend/setup_unix.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Sets up and launches the frontend: creates a virtual environment, installs
# the requirements and the silkmoth package into it, then starts Streamlit.
# All paths are relative, so run this from the frontend directory.

# Virtual environment directory and requirements file
ENV_NAME=".venv"
REQUIREMENTS_FILE="requirements.txt"

# Print the given message and abort the script with exit status 1.
fail() {
    echo "$1"
    exit 1
}

# The requirements file must exist before anything is created
[ -f "$REQUIREMENTS_FILE" ] || fail "Error: '$REQUIREMENTS_FILE' not found."

# Create the virtual environment
python3 -m venv "$ENV_NAME" || fail "Failed to create virtual environment."
echo "Virtual environment '$ENV_NAME' created."

# Activate the virtual environment in the current shell
source "$ENV_NAME/bin/activate" || fail "Failed to activate virtual environment."
echo "Virtual environment '$ENV_NAME' activated."

# Install the pinned requirements
pip install -r "$REQUIREMENTS_FILE" || fail "Failed to install requirements."
echo "Requirements from '$REQUIREMENTS_FILE' installed."

# Install the silkmoth package in editable mode from the sibling src folder
pip install -e ../src || fail "Failed to install the silkmoth package."
echo "Silkmoth package installed."

# Final check that the environment is active (activation exports VIRTUAL_ENV)
[ -n "$VIRTUAL_ENV" ] || fail "Error: Virtual environment activation failed."
echo "Virtual environment '$ENV_NAME' is ready to use."

# Run the Streamlit app
streamlit run app.py
|
||||
51
frontend/setup_win.bat
Normal file
51
frontend/setup_win.bat
Normal file
@@ -0,0 +1,51 @@
|
||||
@echo off

:: Sets up and launches the frontend: creates a virtual environment, installs
:: the requirements and the silkmoth package into it, then starts Streamlit.
:: All paths are relative, so run this from the frontend directory.

:: Name of the virtual environment
set ENV_NAME=.venv

:: Path to the requirements file
set REQUIREMENTS_FILE=requirements.txt

:: Check if requirements.txt exists
if not exist "%REQUIREMENTS_FILE%" (
    echo Error: '%REQUIREMENTS_FILE%' not found.
    exit /b 1
)

:: Create the virtual environment
python -m venv "%ENV_NAME%"
if errorlevel 1 (
    echo Failed to create virtual environment.
    exit /b 1
)
echo Virtual environment '%ENV_NAME%' created.

:: Activate the virtual environment in this cmd session. `call` is needed so
:: control returns to this script afterwards. Running Activate.ps1 through a
:: separate powershell process would only activate that child process, so it
:: is not used here.
call "%ENV_NAME%\Scripts\activate.bat"

:: Verify the activation before installing anything, so packages never end up
:: in the global Python installation.
if not defined VIRTUAL_ENV (
    echo Error: Virtual environment activation failed.
    exit /b 1
)
echo Virtual environment '%ENV_NAME%' activated.

:: Install the requirements
pip install -r "%REQUIREMENTS_FILE%"
if errorlevel 1 (
    echo Failed to install requirements.
    exit /b 1
)
echo Requirements from '%REQUIREMENTS_FILE%' installed.

:: Install the silkmoth package
pip install -e ../src
if errorlevel 1 (
    echo Failed to install the silkmoth package.
    exit /b 1
)
echo Silkmoth package installed.

echo Virtual environment '%ENV_NAME%' is ready to use.

:: Run the Streamlit app
streamlit run app.py
|
||||
43
frontend/utils.py
Normal file
43
frontend/utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def plot_elapsed_times(related_thresholds, elapsed_times_list, fig_text, file_name, xlabel=r'$\theta$', ylabel='Time (s)', title=None, legend_labels=None):
    """
    Plot elapsed times against related thresholds for multiple settings.

    The figure is created with ``matplotlib.figure.Figure`` directly instead
    of ``plt.figure``. It is therefore not registered with pyplot's global
    figure manager and is released as soon as the caller drops it, which keeps
    a long-running app from accumulating one open figure per call.

    Args:
        related_thresholds (list): Related thresholds (x-axis values).
        elapsed_times_list (list of lists): List of elapsed times (y-axis values) for different settings.
        fig_text (str): Text to display on the figure.
        file_name (str): Name of the file to save the plot.
        xlabel (str): Label for the x-axis.
        ylabel (str): Label for the y-axis.
        title (str): Title of the plot (optional).
        legend_labels (list): List of labels for the legend (optional).

    Returns:
        matplotlib.figure.Figure: The figure, which has also been saved to
        ``file_name``.
    """
    # Local import: only this function needs the Figure class.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(8, 6))
    ax = fig.add_subplot()

    # Plot one line per setting; settings without a legend label get a
    # generic "Setting N" label.
    for i, elapsed_times in enumerate(elapsed_times_list):
        label = legend_labels[i] if legend_labels and i < len(legend_labels) else f"Setting {i + 1}"
        ax.plot(related_thresholds, elapsed_times, marker='o', label=label)

    ax.set_xlabel(xlabel, fontsize=14)
    ax.set_ylabel(ylabel, fontsize=14)

    # Put ticks exactly at the measured thresholds
    ax.set_xticks(related_thresholds)

    if title:
        ax.set_title(title, fontsize=16)

    ax.grid(True)
    # The legend is drawn only when the caller supplied labels
    if legend_labels:
        ax.legend(fontsize=12)
    fig.tight_layout()

    # Add figure text in the lower-left corner
    fig.text(0.1, 0.01, fig_text, ha='left', fontsize=10)

    # Save the figure
    fig.savefig(file_name, bbox_inches='tight', dpi=300)

    return fig
|
||||
Reference in New Issue
Block a user