import streamlit as st
import pandas as pd
import json
import os
# Directory containing the JSON files
data_folder = "../experiments/data/webtables/"
# JSON files to be used
reference_file = "reference_sets_inclusion_dependency.json"
source_file = "source_sets_inclusion_dependency.json"
schema_matching_file = "webtable_schemas_sets_500k.json"
# Full paths to the selected files
reference_file_path = os.path.join(data_folder, reference_file)
source_file_path = os.path.join(data_folder, source_file)
schema_matching_file_path = os.path.join(data_folder, schema_matching_file)
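# Optional refactor sketch: the dataset viewers below all repeat the same
# load-JSON-and-preview pattern; a small helper like this could replace the
# per-dataset blocks. The name is illustrative, not part of the original script.
def load_json_preview(path: str, n: int = 50) -> pd.DataFrame:
    """Load a JSON file and return its first `n` rows as a DataFrame."""
    with open(path, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f)).head(n)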
st.title("Datasets")
st.divider()
st.markdown(
"""
This page provides an interactive interface to explore the datasets used in the SilkMoth Engine experiments.
Please note that only the first 50 rows of each dataset are displayed to keep the page responsive.
We perform three types of experiments using two primary data sources:
- Schema Matching Experiment: Utilizes 500,000 Webtable schemas for both the reference and source sets.
- Inclusion Dependency Experiment: Involves 500,000 Webtable columns in the source set, with 1,000 of these selected as the reference set.
- String Matching Experiment: Employs the DBLP dataset for matching tasks.
""",
)
st.divider()
st.subheader("Schema Matching Dataset")
# Load and display the schema matching dataset
try:
    # The schema file is expected to hold a JSON array of Webtable
    # schemas (one set of column names per table); preview the first 50.
    with open(schema_matching_file_path, 'r', encoding='utf-8') as schema_file:
        schema_data = json.load(schema_file)
    schema_df = pd.DataFrame(schema_data).head(50)
    st.dataframe(schema_df)
except Exception as e:
st.error(f"Error loading schema matching dataset: {e}")
st.divider()
st.subheader("Inclusion Dependency Datasets")
# Load and display the reference sets
st.subheader("Reference/Source Sets")
try:
    # The reference sets are the 1,000 Webtable columns sampled from the
    # source sets (see the page description above); preview the first 50.
    with open(reference_file_path, 'r', encoding='utf-8') as ref_file:
        reference_sets = json.load(ref_file)
    ref_df = pd.DataFrame(reference_sets).head(50)
    st.dataframe(ref_df)
except Exception as e:
st.error(f"Error loading reference sets: {e}")