init

2025-09-08 19:05:42 +02:00
commit d85c1c86df
153 changed files with 140246 additions and 0 deletions
--- a/experiments/data_loader.py
+++ b/experiments/data_loader.py
@@ -0,0 +1,174 @@
+import json
+import os
+import random
+
+import pandas as pd
+
+from utils import *
+
+
+class DataLoader:
+    """
+    Loads experiment inputs: webtable columns and schemas from a directory of JSON files,
+    and DBLP paper titles from a CSV file.
+
+    A webtable file is read as a JSON document whose "relation" entry is a list of columns.
+    The schema loaders use the first entry of every column as that column's schema entry.
+    """
+
+    # Minimum number of distinct values a column needs to be used as a source set.
+    _MIN_DISTINCT_VALUES = 4
+
+    def __init__(self, data_path):
+        """
+        Args:
+            data_path: Directory containing the webtable JSON files.
+        """
+        self.data_path = data_path
+        # Directory entries in the order reported by os.listdir; files are addressed by index into this list.
+        self.files = os.listdir(data_path)
+
+    @staticmethod
+    def _read_relation(file_path):
+        """
+        Parse a webtable JSON file and return its "relation" list.
+
+        Returns None when the document has no "relation" entry or the entry is not a list.
+        I/O and parsing errors are not handled here and propagate to the caller.
+        """
+        with open(file_path, 'r', encoding='utf-8') as file:
+            json_data = json.load(file)
+        if "relation" in json_data and isinstance(json_data["relation"], list):
+            return json_data["relation"]
+        return None
+
+    @classmethod
+    def _is_eligible_column(cls, column) -> bool:
+        """
+        Return True if the column has enough distinct values and every value is
+        non-empty and not numeric (as judged by is_convertible_to_number).
+        """
+        if len(set(column)) < cls._MIN_DISTINCT_VALUES:
+            return False
+        return all(not is_convertible_to_number(value) and len(value) > 0 for value in column)
+
+    def load_webtable_columns_randomized(self, reference_set_amount: int, source_set_amount: int) -> tuple[list, list]:
+        """
+        Get randomized reference sets and source sets of webtable columns.
+        Reference sets are a random subset of the source sets.
+        Only columns with 4 or more different elements are considered.
+        Only columns whose values are all non-empty and non-numeric are considered.
+
+        source_set_amount files are selected at random; columns are then drawn (with replacement)
+        from those files until enough eligible columns were found, so the same column may occur
+        more than once in the result. Files without a usable, non-empty "relation" list are skipped.
+
+        Args:
+            reference_set_amount (int): Number of reference sets to return.
+            source_set_amount (int): Number of source sets to return.
+        Returns:
+            tuple: A tuple containing a list of reference sets and a list of source sets.
+        Raises:
+            ValueError: If the parameters are invalid, a selected file cannot be read or parsed,
+                or none of the selected files contains an eligible column.
+        """
+        # Basic validation of input parameters
+        if reference_set_amount < 1 or source_set_amount < 2:
+            raise ValueError("reference_set_amount must be at least 1 and source_set_amount must be at least 2")
+        if reference_set_amount >= source_set_amount:
+            raise ValueError("reference_set_amount must be smaller than source_set_amount")
+        # Checked before the size comparisons, which would otherwise always fire first for an empty directory
+        if len(self.files) == 0:
+            raise ValueError("data_path does not contain any files")
+        if reference_set_amount > len(self.files):
+            raise ValueError("reference_set_amount must not exceed the number of files in data_path")
+        if source_set_amount > len(self.files):
+            raise ValueError("source_set_amount must not exceed the number of files in data_path")
+
+        # Randomly select the files the source sets are drawn from
+        source_set_nums = random.sample(range(len(self.files)), source_set_amount)
+
+        source_sets = []
+        # Bookkeeping used only until the first eligible column is found: once every selected file
+        # was opened and every one of its columns was drawn without success, no eligible column
+        # exists and the loop below could never finish.
+        unvisited_files = set(source_set_nums)
+        untested_columns = set()
+
+        while len(source_sets) < source_set_amount:
+            # Pick a random file out of the selected ones
+            source_set_num = random.choice(source_set_nums)
+            file_path = os.path.join(self.data_path, self.files[source_set_num])
+
+            try:
+                relation = self._read_relation(file_path)
+
+                if not source_sets and source_set_num in unvisited_files:
+                    unvisited_files.discard(source_set_num)
+                    untested_columns.update((source_set_num, i) for i in range(len(relation or ())))
+
+                # A missing or empty relation has no column to draw from
+                if relation:
+                    col_num = random.randint(0, len(relation) - 1)
+                    if not source_sets:
+                        untested_columns.discard((source_set_num, col_num))
+
+                    col = relation[col_num]
+                    if self._is_eligible_column(col):
+                        source_sets.append(col)
+                        print(f"Source set number {len(source_sets)} loaded")
+            except Exception as e:
+                raise ValueError(f"Error loading JSON file: {e}") from e
+
+            if not source_sets and not unvisited_files and not untested_columns:
+                raise ValueError("none of the selected files contains an eligible column")
+
+        # Randomly select reference sets from the source sets
+        reference_sets = random.sample(source_sets, reference_set_amount)
+        return reference_sets, source_sets
+
+    def load_webtable_reference_sets_element_restriction(self, source_set: list, element_restriction: int,
+                                                         reference_set_amount: int = 1000) -> list:
+        """
+        Get reference sets of webtable columns with a specific element restriction.
+        Restriction is the minimal number of elements (len of the column) allowed in a reference set.
+        Columns are drawn with replacement, so the result may contain the same column several times.
+
+        Args:
+            source_set (list): The columns to draw the reference sets from.
+            element_restriction (int): The minimal number of elements in each reference set.
+            reference_set_amount (int): Number of reference sets to return. Defaults to 1000.
+        Returns:
+            list: A list of reference sets.
+        Raises:
+            ValueError: If element_restriction or reference_set_amount is smaller than 1, or if
+                no column in source_set has at least element_restriction elements.
+        """
+        if element_restriction < 1:
+            raise ValueError("element_restriction must be at least 1")
+        if reference_set_amount < 1:
+            raise ValueError("reference_set_amount must be at least 1")
+        # Without at least one sufficiently long column the sampling loop below would never finish
+        if not any(len(col) >= element_restriction for col in source_set):
+            raise ValueError("source_set contains no column with at least element_restriction elements")
+
+        reference_sets = []
+
+        while len(reference_sets) < reference_set_amount:
+            # Randomly select a column from the source set
+            col = random.choice(source_set)
+
+            # Keep the column only if it has at least element_restriction elements
+            if len(col) >= element_restriction:
+                reference_sets.append(col)
+                print(f"Reference set number {len(reference_sets)} loaded")
+
+        return reference_sets
+
+    def load_webtable_schemas_randomized(self, set_amount: int) -> list:
+        """
+        Load up to set_amount schemas from the files in data_path, visiting the files in random order.
+        Files whose schema cannot be loaded are reported and skipped, so fewer than set_amount
+        schemas are returned when not enough files are usable.
+
+        Args:
+            set_amount (int): Number of schemas to load.
+        Returns:
+            list: A list of schemas (each a list of schema entries).
+        Raises:
+            ValueError: If set_amount is smaller than 2.
+        """
+        if set_amount < 2:
+            raise ValueError("set_amount must be at least 2")
+        # Random sequence of table numbers
+        table_nums = random.sample(range(len(self.files)), len(self.files))
+
+        schema_sets = []
+
+        for table_num in table_nums:
+            if len(schema_sets) >= set_amount:
+                break
+            try:
+                # Load the schema for the current table number
+                schema = self.load_single_webtable_schema(table_num)
+            except ValueError as e:
+                print(f"Skipping table number {table_num} due to error: {e}")
+                continue
+            schema_sets.append(schema)
+            print(f"Schema set number {len(schema_sets)} loaded")
+
+        return schema_sets
+
+    def load_single_webtable_schema(self, reference_set_num: int) -> list:
+        """
+        Load the schema of the webtable file at position reference_set_num in self.files.
+        The schema consists of the first entry of every column in the file's "relation" list,
+        with empty strings removed.
+
+        Args:
+            reference_set_num (int): Index of the file in self.files.
+        Returns:
+            list: The non-empty schema entries.
+        Raises:
+            IndexError: If reference_set_num is out of range.
+            ValueError: For every other failure (unreadable or invalid JSON, missing "relation" list,
+                empty schema, numeric schema entries). The message is always prefixed with
+                "Error loading JSON file: ".
+        """
+        if reference_set_num < 0 or reference_set_num >= len(self.files):
+            raise IndexError("reference_set_num is out of range")
+
+        # Get the file at the specified position
+        file_path = os.path.join(self.data_path, self.files[reference_set_num])
+
+        try:
+            relation = self._read_relation(file_path)
+            if relation is None:
+                raise ValueError("JSON does not contain a valid 'relation' key or it is not a list")
+
+            schema = [column[0] for column in relation]
+            if len(schema) == 0:
+                raise ValueError("Schema is empty")
+
+            if any(is_convertible_to_number(entry) for entry in schema):
+                raise ValueError("Schema contains numeric values or is empty")
+
+            # remove "" empty strings from the schema
+            schema = [entry for entry in schema if len(entry) > 0]
+            if len(schema) == 0:
+                raise ValueError("Schema contains only empty strings")
+            return schema
+        except Exception as e:
+            # Every failure, including the validation errors above, is reported as a ValueError so
+            # that callers only have to handle a single exception type.
+            raise ValueError(f"Error loading JSON file: {e}") from e
+
+    def load_dblp_titles(self, data_path: str) -> list:
+        """
+        Load DBLP paper titles from a CSV file.
+
+        Args:
+            data_path (str): Path to CSV file containing a column 'title'.
+
+        Returns:
+            list: A list of title strings (rows without a title are dropped).
+
+        Raises:
+            FileNotFoundError: If data_path does not exist.
+            ValueError: If the CSV has no 'title' column.
+        """
+        if not os.path.exists(data_path):
+            raise FileNotFoundError(f"DBLP CSV file not found: {data_path}")
+
+        df = pd.read_csv(data_path)
+        if "title" not in df.columns:
+            raise ValueError("CSV must contain a 'title' column")
+
+        titles = df["title"].dropna().tolist()
+        return titles
+
+