import streamlit as st
import pandas as pd
import json
import os
# Directory containing the JSON files
data_folder = "../experiments/data/webtables/"
# JSON files to be used
reference_file = "reference_sets_inclusion_dependency.json"
source_file = "source_sets_inclusion_dependency.json"
schema_matching_file = "webtable_schemas_sets_500k.json"
# Full paths to the selected files
reference_file_path = os.path.join(data_folder, reference_file)
source_file_path = os.path.join(data_folder, source_file)
schema_matching_file_path = os.path.join(data_folder, schema_matching_file)
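# Optional refactor sketch: the dataset viewers below all repeat the same
# load-JSON-and-preview pattern; a small helper like this could replace the
# per-dataset blocks. The name is illustrative, not part of the original script.
def load_json_preview(path: str, n: int = 50) -> pd.DataFrame:
    """Load a JSON file and return its first `n` rows as a DataFrame."""
    with open(path, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f)).head(n)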
st.title("Datasets")
st.divider()
st.markdown(
"""
This page provides an interactive interface to explore the datasets used in the SilkMoth Engine experiments.
Please note that only the first 50 rows of each dataset are displayed to keep the page responsive.
We perform three types of experiments using two primary data sources:
- Schema Matching Experiment: Utilizes 500,000 Webtable schemas for both the reference and source sets.
- Inclusion Dependency Experiment: Involves 500,000 Webtable columns in the source set, with 1,000 of these selected as the reference set.
- String Matching Experiment: Employs the DBLP dataset for matching tasks.
""",
)
st.divider()
st.subheader("Schema Matching Dataset")
# Load and display the schema matching dataset
try:
    # The schema file is expected to hold a JSON array of Webtable
    # schemas (one set of column names per table); preview the first 50.
    with open(schema_matching_file_path, 'r', encoding='utf-8') as schema_file:
        schema_data = json.load(schema_file)
    schema_df = pd.DataFrame(schema_data).head(50)
    st.dataframe(schema_df)
except Exception as e:
st.error(f"Error loading schema matching dataset: {e}")
st.divider()
st.subheader("Inclusion Dependency Datasets")
# Load and display the reference sets
st.subheader("Reference/Source Sets")
try:
    # The reference sets are the 1,000 Webtable columns sampled from the
    # source sets (see the page description above); preview the first 50.
    with open(reference_file_path, 'r', encoding='utf-8') as ref_file:
        reference_sets = json.load(ref_file)
    ref_df = pd.DataFrame(reference_sets).head(50)
    st.dataframe(ref_df)
except Exception as e:
st.error(f"Error loading reference sets: {e}")