# SilkMoth/experiments/data_loader.py

import json
import os
import random

import pandas as pd

from utils import *


class DataLoader:
    """
    Load randomized experiment inputs: webtable columns, webtable schemas and DBLP titles.

    The webtable loaders read JSON files from ``data_path``; each file is expected
    to hold a ``"relation"`` list with one inner list per table column.
    """

    def __init__(self, data_path):
        """
        Remember the data directory and index its entries.

        Args:
            data_path: Directory whose entries are the webtable JSON files.
        """
        self.data_path = data_path
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable so that seeded runs are reproducible.
        self.files = sorted(os.listdir(data_path))

    def _read_json(self, file_num: int):
        """Open the file at position ``file_num`` of ``self.files`` and return its parsed JSON."""
        file_path = os.path.join(self.data_path, self.files[file_num])
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    @staticmethod
    def _is_eligible_column(col) -> bool:
        """Return True if ``col`` is a list of non-empty, non-numeric strings with >= 4 distinct values."""
        if not isinstance(col, list):
            return False
        # Cheap structural checks first; non-string cells (numbers, null) disqualify the column.
        if not all(isinstance(value, str) and value for value in col):
            return False
        if len(set(col)) < 4:
            return False
        return not any(is_convertible_to_number(value) for value in col)

    def _load_column_slots(self, file_num: int) -> list:
        """
        Parse one webtable file into per-column slots.

        Returns:
            list: One entry per column of the table, holding the column itself when it is
            eligible and None otherwise. Empty when the file has no usable "relation" list.
        Raises:
            ValueError: If the file cannot be read or is not valid JSON.
        """
        try:
            json_data = self._read_json(file_num)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            raise ValueError(f"Error loading JSON file: {e}") from e

        relation = json_data.get("relation") if isinstance(json_data, dict) else None
        if not isinstance(relation, list):
            return []
        return [col if self._is_eligible_column(col) else None for col in relation]

    def load_webtable_columns_randomized(self, reference_set_amount: int, source_set_amount: int) -> tuple[list, list]:
        """
        Get randomized reference sets and source sets of webtable columns.
        Reference sets are subsets of the source sets.
        Only columns with 4 or more different elements are considered.
        Only considering columns with non-numeric, non-empty string values.

        ``source_set_amount`` files are sampled first; columns are then drawn (with
        replacement) by picking a random sampled file and a random column of it until
        enough eligible columns were collected. The same column may be drawn more than once.
        Each visited file is parsed once and its eligible columns are kept in memory
        for the duration of the call.

        Args:
            reference_set_amount (int): Number of reference sets to return.
            source_set_amount (int): Number of source sets to return.
        Returns:
            tuple: A tuple containing a list of reference sets and a list of source sets.
        Raises:
            ValueError: If the arguments are out of range, a visited file cannot be
                loaded, or none of the sampled files contains an eligible column.
        """
        # Basic validation of input parameters
        if reference_set_amount < 1 or source_set_amount < 2:
            raise ValueError("reference_set_amount must be at least 1 and source_set_amount must be at least 2")
        if reference_set_amount >= source_set_amount:
            raise ValueError("reference_set_amount must be smaller than source_set_amount")
        # Must run before the file-count comparisons, otherwise it can never trigger.
        if not self.files:
            raise ValueError("data_path does not contain any files")
        if reference_set_amount > len(self.files):
            raise ValueError("reference_set_amount must not exceed the number of files in data_path")
        if source_set_amount > len(self.files):
            raise ValueError("source_set_amount must not exceed the number of files in data_path")

        # Randomly select the files the source sets are drawn from
        source_set_nums = random.sample(range(len(self.files)), source_set_amount)

        # File index -> column slots, so every file is read and parsed at most once.
        parsed_files = {}
        # Number of visited files without any eligible column; once this reaches the
        # number of sampled files the rejection loop below could never finish.
        barren_files = 0

        source_sets = []
        while len(source_sets) < source_set_amount:
            source_set_num = random.choice(source_set_nums)

            if source_set_num not in parsed_files:
                slots = self._load_column_slots(source_set_num)
                parsed_files[source_set_num] = slots
                if not any(slot is not None for slot in slots):
                    barren_files += 1
                    if barren_files == len(source_set_nums):
                        raise ValueError("None of the sampled files contains an eligible column")

            slots = parsed_files[source_set_num]
            if not slots:
                # No columns at all (missing/empty "relation"): nothing to pick from.
                continue

            # Pick a random column; ineligible columns are rejected and the draw is retried.
            picked = slots[random.randint(0, len(slots) - 1)]
            if picked is not None:
                # Copy so repeated draws of one column do not alias the same list object.
                source_sets.append(list(picked))
                print(f"Source set number {len(source_sets)} loaded")

        # Randomly select reference sets from the source sets
        reference_sets = random.sample(source_sets, reference_set_amount)
        return reference_sets, source_sets

    def load_webtable_reference_sets_element_restriction(self, source_set: list, element_restriction: int,
                                                         reference_set_amount: int = 1000) -> list:
        """
        Get reference sets of webtable columns with a specific element restriction.
        Restriction is the minimal number of elements (duplicates included) a column
        must have to be used as a reference set.

        Columns are drawn from ``source_set`` with replacement, so the result may
        contain the same column several times.

        Args:
            source_set (list): The columns to draw the reference sets from.
            element_restriction (int): Minimal number of elements per reference set.
            reference_set_amount (int): Number of reference sets to return. Defaults to 1000.
        Returns:
            list: A list of reference sets.
        Raises:
            ValueError: If ``element_restriction`` is smaller than 1, ``reference_set_amount``
                is negative, or no column in ``source_set`` satisfies the restriction.
        """
        if element_restriction < 1:
            raise ValueError("element_restriction must be at least 1")
        if reference_set_amount < 0:
            raise ValueError("reference_set_amount must not be negative")
        # Without at least one long-enough column the rejection loop below would never end
        # (and random.choice would fail outright on an empty source_set).
        if not any(len(col) >= element_restriction for col in source_set):
            raise ValueError("source_set contains no column with at least element_restriction elements")

        reference_sets = []

        while len(reference_sets) < reference_set_amount:
            # Randomly select a column from the source set
            col = random.choice(source_set)

            # Keep the column only if it has at least element_restriction elements
            if len(col) >= element_restriction:
                reference_sets.append(col)
                print(f"Reference set number {len(reference_sets)} loaded")

        return reference_sets

    def load_webtable_schemas_randomized(self, set_amount: int) -> list:
        """
        Get up to ``set_amount`` webtable schemas from randomly ordered tables.

        Tables whose schema cannot be loaded are skipped with a printed notice, so
        fewer than ``set_amount`` schemas are returned when not enough valid tables exist.

        Args:
            set_amount (int): Number of schemas to collect.
        Returns:
            list: A list of schemas (each a list of column headers).
        Raises:
            ValueError: If ``set_amount`` is smaller than 2.
        """
        if set_amount < 2:
            raise ValueError("set_amount must be at least 2")
        # Random sequence of table numbers
        table_nums = random.sample(range(len(self.files)), len(self.files))

        schema_sets = []
        for table_num in table_nums:
            if len(schema_sets) >= set_amount:
                break
            try:
                schema = self.load_single_webtable_schema(table_num)
            except ValueError as e:
                print(f"Skipping table number {table_num} due to error: {e}")
                continue
            schema_sets.append(schema)
            print(f"Schema set number {len(schema_sets)} loaded")

        return schema_sets

    def load_single_webtable_schema(self, reference_set_num: int) -> list:
        """
        Load the schema (first element of every column) of a single webtable.

        Empty-string headers are dropped from the returned schema.

        Args:
            reference_set_num (int): Position of the table file in ``self.files``.
        Returns:
            list: The non-empty column headers of the table.
        Raises:
            IndexError: If ``reference_set_num`` is out of range.
            ValueError: If the file cannot be loaded or its schema is missing, empty,
                or contains numeric headers. Every failure is reported with the
                "Error loading JSON file: " prefix.
        """
        if reference_set_num < 0 or reference_set_num >= len(self.files):
            raise IndexError("reference_set_num is out of range")

        try:
            json_data = self._read_json(reference_set_num)
            if "relation" not in json_data or not isinstance(json_data["relation"], list):
                raise ValueError("JSON does not contain a valid 'relation' key or it is not a list")

            schema = [relation[0] for relation in json_data["relation"]]
            if len(schema) == 0:
                raise ValueError("Schema is empty")
            if any(is_convertible_to_number(col) for col in schema):
                raise ValueError("Schema contains numeric values or is empty")

            # remove "" empty strings from the schema
            schema = [col for col in schema if len(col) > 0]
            if len(schema) == 0:
                raise ValueError("Schema contains only empty strings")
            return schema
        except Exception as e:
            # Deliberately broad: callers skip bad tables by catching ValueError, so any
            # malformed content (non-string header, empty column, ...) must map to it.
            raise ValueError(f"Error loading JSON file: {e}") from e

    def load_dblp_titles(self, data_path: str) -> list:
        """
        Load DBLP paper titles from a CSV file.

        Rows with a missing title are dropped.

        Args:
            data_path (str): Path to CSV file containing a column 'title'.

        Returns:
            list: A list of title strings.
        Raises:
            FileNotFoundError: If ``data_path`` does not exist.
            ValueError: If the CSV has no 'title' column.
        """

        if not os.path.exists(data_path):
            raise FileNotFoundError(f"DBLP CSV file not found: {data_path}")

        df = pd.read_csv(data_path)
        if "title" not in df.columns:
            raise ValueError("CSV must contain a 'title' column")

        titles = df["title"].dropna().tolist()
        return titles