Skip to content

Preprocessing

In-built preprocessing functions to handle graph data.

apply_scaler(dataset, method='zero_mean', target='node') #

Applies the selected scaling method to the provided dataset. After scaling, the used scaler instance is accessible through the StaticGraphDataset instance as either node_scaler or edge_scaler depending on the given scaling target.

The dataset needs to be split with either of the create_train_test_split-methods in order to correctly apply scaling. (Fitting only on training data and applying to training and test data)

Scaling is applied to both the inputs and labels and done per feature. For time-series data, this means that each feature of every graph in the input sequence is scaled independently to avoid weighting repetitions in the sequence too much.

Parameters:

Name Type Description Default
dataset StaticGraphDataset

Dataset to be scaled

required
method str

Scaling method to be applied. Either zero_mean or min_max

'zero_mean'
target str

Either node or edge. Selects whether to scale the node features of each graph or the edge features (if they are present)

'node'

Returns:

Type Description
Tuple[GraphList, GraphList] | Tuple[GraphList, GraphList, GraphList, GraphList]

Either a 4-tuple of scaled data if the dataset consists of time-series data. Else a 2-tuple of the scaled train and test data.

Source code in graphs_on_grids/preprocessing/preprocessing.py
def apply_scaler(
    dataset: StaticGraphDataset, method: str = "zero_mean", target: str = "node"
) -> Tuple[GraphList, GraphList] | Tuple[GraphList, GraphList, GraphList, GraphList]:
    """Applies the selected scaling method to the provided dataset. After scaling, the used scaler instance is
    accessible through the `StaticGraphDataset` instance as either `node_scaler` or `edge_scaler` depending on the
    given scaling target.

    The dataset needs to be split with either of the `create_train_test_split`-methods in order to correctly apply
    scaling. (Fitting only on training data and applying to training and test data)

    Scaling is applied to both the inputs and labels and done per feature. For time-series data, the input and
    label frames are concatenated column-wise before the scaler is fitted, so that each feature of every graph in
    the input sequence is scaled independently to avoid weighting repetitions in the sequence too much.

    :param dataset: Dataset to be scaled
    :param method: Scaling method to be applied. Either `zero_mean` or `min_max`
    :param target: Either `node` or `edge`. Selects whether to scale the node features of each graph or the edge
     features (if they are present)
    :return: Either a 4-tuple `(X_train, X_test, y_train, y_test)` of scaled data if the dataset consists of
     time-series data. Else a 2-tuple of the scaled train and test data.
    :raises ValueError: If `dataset` is not a `StaticGraphDataset`, if `target` or `method` is not one of the
     supported values, if the dataset has not been split yet, or if `target` is `edge` but the dataset has no edge
     features.
    """
    if not isinstance(dataset, StaticGraphDataset):
        # NOTE(review): assumes StaticGraphDataset exposes a `type()` classmethod - confirm, otherwise this
        # raises AttributeError instead of the intended ValueError.
        raise ValueError(
            f"Expected input to be of type {StaticGraphDataset.type()}. Received type {type(dataset)}"
        )

    # Validate the string arguments before any data is converted. Previously an invalid target was only
    # rejected on some code paths and could otherwise fit a scaler that was then silently discarded.
    if target not in ("node", "edge"):
        raise ValueError(
            f"Expected target to be either 'node' or 'edge'. Received {target}"
        )
    if method == "zero_mean":
        scaler = StandardScaler()
    elif method == "min_max":
        scaler = MinMaxScaler()
    else:
        raise ValueError(
            f"Invalid method={method} provided. Only 'zero_mean' or 'min_max' are valid'"
        )

    if not dataset.train or not dataset.test:
        raise ValueError(
            "The dataset has not yet been split into a training or test set. Did you already call 'create_train_test_split' on this dataset?"
        )

    # When `to_pandas()` returns a list, index 0 is used for node data and index 1 for edge data.
    frame_idx = 0 if target == "node" else 1
    missing_edge_features_msg = (
        "Expected dataset to contain edge features with for target 'edge'"
    )

    train, _, test = dataset.get_splits()
    # A plain split is a single GraphList; a windowed split is an [inputs, labels] pair.
    is_time_series = not isinstance(train, GraphList)
    if is_time_series:
        X_train, y_train, X_test, y_test = (
            train[0].to_pandas(),
            train[1].to_pandas(),
            test[0].to_pandas(),
            test[1].to_pandas(),
        )
        if isinstance(X_train, list):
            X_train, y_train, X_test, y_test = (
                X_train[frame_idx],
                y_train[frame_idx],
                X_test[frame_idx],
                y_test[frame_idx],
            )
        elif target == "edge":
            # Without this check the node frame would be scaled and written back as edge features.
            raise ValueError(missing_edge_features_msg)
        # Inputs and labels share one scaler, so they are fitted/transformed as one wide frame.
        train = pd.concat([X_train, y_train], axis=1)
        test = pd.concat([X_test, y_test], axis=1)
        del X_train, X_test, y_train, y_test
    else:
        train, test = train.to_pandas(), test.to_pandas()
        if isinstance(train, list) and isinstance(test, list):
            train, test = train[frame_idx], test[frame_idx]
        elif target == "edge":
            raise ValueError(missing_edge_features_msg)

    # Fit on the training data only; the test data is transformed with the training statistics.
    train = scaler.fit_transform(train)
    test = scaler.transform(test)

    if target == "node":
        num_nodes = dataset.adjacency_matrix.shape[0]
        dataset.node_scaler = scaler
        _replace_node_features(num_nodes, dataset.train, train)
        _replace_node_features(num_nodes, dataset.test, test)
    else:
        # The number of edges is taken as the count of entries equal to 1 in the adjacency matrix.
        num_edges = np.count_nonzero(dataset.adjacency_matrix == 1)
        dataset.edge_scaler = scaler
        _replace_edge_features(num_edges, dataset.train, train)
        _replace_edge_features(num_edges, dataset.test, test)

    if is_time_series:
        return dataset.train[0], dataset.test[0], dataset.train[1], dataset.test[1]
    return dataset.train, dataset.test

create_train_test_split(dataset, train_size=0.8, random_state=None, shuffle=True) #

Create a train-test-split from an instance of gog.structure.graph.StaticGraphDataset()

Parameters:

Name Type Description Default
dataset StaticGraphDataset

Dataset to be split

required
train_size

Relative size of the training set as a value between 0 and 1. The test set will contain the remaining \( 1 - \text{train\_size} \) fraction of the instances

0.8
random_state

Sets the random state for shuffling

None
shuffle

Whether to shuffle the data before splitting.

True

Returns:

Type Description
Tuple[GraphList, GraphList]

Tuple of gog.structure.graph.GraphList() instances containing the train and test set

Source code in graphs_on_grids/preprocessing/preprocessing.py
def create_train_test_split(
    dataset: StaticGraphDataset, train_size=0.8, random_state=None, shuffle=True
) -> Tuple[GraphList, GraphList]:
    r"""Split the graphs of a `gog.structure.graph.StaticGraphDataset()` into a train and a test set.

    The first `int(len(graphs) * train_size)` graphs (after optional shuffling) form the training set, the rest
    the test set. Both splits are also registered on the dataset via `set_splits`.

    Note that shuffling is done in place on `dataset.graphs` and seeds NumPy's global random state.

    :param dataset: Dataset to be split
    :param train_size: Relative size of the training set as a value between 0 and 1. The test set will contain
     the remaining \( 1 - train\_size \) fraction of the instances
    :param random_state: Sets the random state for shuffling
    :param shuffle: Whether to shuffle the data before splitting.
    :return: Tuple of `gog.structure.graph.GraphList()` instances containing the train and test set
    :raises ValueError: If `dataset` is not a `StaticGraphDataset`
    """
    is_dataset = isinstance(dataset, StaticGraphDataset)
    if not is_dataset:
        # NOTE(review): assumes StaticGraphDataset exposes a `type()` classmethod - confirm.
        message = f"Expected input to be of type {StaticGraphDataset.type()}. Received type {type(dataset)}"
        raise ValueError(message)

    all_graphs = dataset.graphs
    total = len(all_graphs)

    if shuffle:
        np.random.seed(random_state)
        np.random.shuffle(all_graphs)

    # Everything before the boundary is training data, everything from it onwards is test data.
    boundary = int(total * train_size)
    train_part = all_graphs[0:boundary]
    test_part = all_graphs[boundary : total + 1]

    dataset.set_splits(train=train_part, test=test_part)
    return train_part, test_part

create_train_test_split_windowed(dataset, window_size, len_labels=1, step=1, start=0, train_size=0.8, random_state=None, shuffle=False) #

Creates a windowed dataset from the provided StaticGraphDataset instance. After that, a train-test-split is created from the windowed data.

Parameters:

Name Type Description Default
dataset StaticGraphDataset

Dataset to be windowed and split

required
window_size int

Sequence length to be provided as input to the model

required
len_labels int

The output sequence length to be predicted by the model

1
step int

Step size of the windowing algorithm. Describes how much the window start is shifted after creating a window instance. If set to window_size, each graph in the dataset is only used for a single instance.

1
start int

Start index for windowing

0
train_size float

Relative size of the training set as a value between 0 and 1. The test set will contain the remaining \( 1 - \text{train\_size} \) fraction of the instances

0.8
random_state int

Sets the random state for shuffling

None
shuffle bool

Whether to shuffle the data before splitting.

False

Returns:

Type Description
Tuple[GraphList, GraphList, GraphList, GraphList]

Tuple of gog.structure.graph.GraphList() instances containing the train and test instances and labels

Source code in graphs_on_grids/preprocessing/preprocessing.py
def create_train_test_split_windowed(
    dataset: StaticGraphDataset,
    window_size: int,
    len_labels: int = 1,
    step: int = 1,
    start: int = 0,
    train_size: float = 0.8,
    random_state: int = None,
    shuffle: bool = False,
) -> Tuple[GraphList, GraphList, GraphList, GraphList]:
    r"""Creates a windowed dataset from the provided `StaticGraphDataset` instance. After that, a train-test-split
    is created from the windowed data.

    As a side effect, `dataset.graphs` is replaced by the list of input windows and the splits are registered on
    the dataset via `set_splits(train=[X_train, y_train], test=[X_test, y_test])`.

    :param dataset: Dataset to be windowed and split
    :param window_size: Sequence length to be provided as input to the model
    :param len_labels: The output sequence length to be predicted by the model
    :param step:  Step size of the windowing algorithm. Describes how much the window start is shifted after creating a
     window instance. If set to `window_size`, each graph in the dataset is only used for a single instance.
     Must be at least 1.
    :param start: Start index for windowing
    :param train_size: Relative size of the training set as a value between 0 and 1. The test set will contain
     the remaining \( 1 - train\_size \) fraction of the instances
    :param random_state: Sets the random state for shuffling
    :param shuffle: Whether to shuffle the data before splitting. Windows and labels are shuffled with the same
     permutation so that every window keeps its own labels.
    :return: Tuple `(X_train, X_test, y_train, y_test)` of `gog.structure.graph.GraphList()` instances containing
     the train and test instances and labels
    :raises ValueError: If `dataset` is not a `StaticGraphDataset`, if `step` is smaller than 1, or if the dataset
     is too small to create a single window
    """
    if not isinstance(dataset, StaticGraphDataset):
        # NOTE(review): assumes StaticGraphDataset exposes a `type()` classmethod - confirm.
        raise ValueError(
            f"Expected input to be of type {StaticGraphDataset.type()}. Received type {type(dataset)}"
        )
    if step < 1:
        # A non-positive step never advances `start`, so the windowing loop below would not terminate.
        raise ValueError(f"Expected step to be a positive integer. Received {step}")

    graphs = dataset.graphs

    def _empty_like_graphs() -> GraphList:
        # Empty container that carries over the node/edge metadata of the source graphs.
        return GraphList(
            num_nodes=graphs.num_nodes,
            node_feature_names=graphs.node_feature_names,
            num_edges=graphs.num_edges,
            edge_feature_names=graphs.edge_feature_names,
            strict_checks=False,
        )

    windows = _empty_like_graphs()
    labels = _empty_like_graphs()

    num_graphs = len(graphs)
    while start + window_size + len_labels < num_graphs:
        end = start + window_size
        current_window = graphs[start:end]
        # NOTE(review): labels start at `end + 1`; with half-open slicing the graph at index `end` is part of
        # neither the window nor its labels - confirm this gap is intended.
        label_window = graphs[end + 1 : end + 1 + len_labels]
        if len(current_window) == window_size:
            labels.append(label_window)
            windows.append(current_window)
        start = start + step

    if len(labels) == 0:
        # Fail with a clear message instead of an index error on `labels[-1]` below.
        raise ValueError(
            f"Dataset of size {num_graphs} is too small to create a window with window size {window_size} and {len_labels} labels."
        )

    discarded_graphs = num_graphs - 1 - labels[-1][-1].ID
    if discarded_graphs != 0:
        logging.warning(
            f"Dataset of size {num_graphs}, cannot be cleanly divided with window size {window_size}. Discarded {discarded_graphs} graph instances."
        )
    dataset.graphs = windows
    dataset.graphs.strict_checks = False

    if shuffle:
        np.random.seed(random_state)
        # Windows and labels must receive the SAME permutation. Two consecutive shuffles draw different
        # random numbers, so the generator state is restored before shuffling the (equally long) labels.
        rng_state = np.random.get_state()
        np.random.shuffle(windows)
        np.random.set_state(rng_state)
        np.random.shuffle(labels)

    num_instances = len(windows)
    last_train_index = int(num_instances * train_size)
    X_train, X_test, y_train, y_test = (
        windows[0:last_train_index].copy(),
        windows[last_train_index : num_instances + 1].copy(),
        labels[0:last_train_index].copy(),
        labels[last_train_index : num_instances + 1].copy(),
    )
    dataset.set_splits(train=[X_train, y_train], test=[X_test, y_test])
    return X_train, X_test, y_train, y_test

create_validation_set(X, y, validation_size=0.2) #

Creates a validation set from provided data.

Parameters:

Name Type Description Default
X GraphList

Training data to be split

required
y GraphList

Labels of training data to be split

required
validation_size float

Relative size of validation set. The training set will contain the remaining \( 1 - \text{validation\_size} \) fraction of the original training set.

0.2

Returns:

Type Description
Tuple[GraphList, GraphList, GraphList, GraphList]

A 4-Tuple of the training and validation set inputs and targets.

Source code in graphs_on_grids/preprocessing/preprocessing.py
def create_validation_set(
    X: GraphList, y: GraphList, validation_size: float = 0.2
) -> Tuple[GraphList, GraphList, GraphList, GraphList]:
    r"""Creates a validation set from provided data.

    The first `int(len(X) * validation_size)` instances become the validation set, the remaining instances stay
    in the training set. If the elements of `X` are themselves `GraphList` instances (time-series data), the split
    is delegated to `_create_time_series_validation_set`.

    :param X: Training data to be split
    :param y: Labels of training data to be split
    :param validation_size: Relative size of validation set. The training set will contain the remaining
     \( 1 - validation\_size \) fraction of the original training set.
    :return: A 4-Tuple `(X_train, X_val, y_train, y_val)` of the training and validation set inputs and targets.
    :raises ValueError: If `X` or `y` is not a `GraphList` or if they differ in length
    """
    if not isinstance(X, GraphList) or not isinstance(y, GraphList):
        # Format the class itself; building a throwaway `GraphList()` just to print its type could fail
        # inside the error path and renders the same text.
        raise ValueError(
            f"Expected both inputs for X and y to be of type {GraphList}. Received types {type(X), type(y)}"
        )
    if len(X) != len(y):
        raise ValueError(
            f"Expected same number of instances in X and y. Received {len(X), len(y)}"
        )
    # Only peek at the last element when there is one, so empty inputs yield empty splits.
    if len(X) > 0 and isinstance(X[-1], GraphList):
        return _create_time_series_validation_set(X, y, validation_size)

    # X and y have equal length here, so a single split index applies to both.
    split_idx = int(len(X) * validation_size)
    X_train, X_val = X[split_idx:], X[:split_idx]
    y_train, y_val = y[split_idx:], y[:split_idx]
    return X_train, X_val, y_train, y_val

mask_features(X_train, X_test, targets, node_indices, method='zeros') #

Masks selected features of nodes at the provided indices by either a set or random value.

Parameters:

Name Type Description Default
X_train GraphList

Training set

required
X_test GraphList

Test set

required
targets List[str]

Which node features to mask

required
node_indices List | ndarray

Which nodes to apply the feature masking to

required
method str

Either zeros, ones or random

'zeros'

Returns:

Type Description
Tuple[GraphList, GraphList]

A pair of the masked train and test split

Source code in graphs_on_grids/preprocessing/preprocessing.py
def mask_features(
    X_train: GraphList,
    X_test: GraphList,
    targets: List[str],
    node_indices: List | np.ndarray,
    method: str = "zeros",
) -> Tuple[GraphList, GraphList]:
    """Masks selected features of nodes at the provided indices by either a set or random value.

    Every graph is copied via `graph.__copy__()` before it is handed to `_mask_split`, and the results are
    collected in two new `GraphList` instances that carry over the metadata (`num_nodes`, feature names,
    `num_edges`, `strict_checks`) of the respective input split.

    :param X_train: Training set
    :param X_test: Test set
    :param targets: Which node features to mask
    :param node_indices: Which nodes to apply the feature masking to
    :param method: Either `zeros`, `ones` or `random`
    :return: A pair of the masked train and test split
    :raises ValueError: If `X_train` or `X_test` is not a `GraphList`
    """
    if not isinstance(X_train, GraphList) or not isinstance(X_test, GraphList):
        # Format the class itself: `type(GraphList)` is the metaclass and rendered as "<class 'type'>".
        raise ValueError(
            f"Expected both inputs for X_train and X_test to be of type {GraphList}. Received types {type(X_train), type(X_test)}"
        )

    # Computed once and shared by both splits.
    feature_indices = _get_feature_indices(X_train, X_test, targets, len(node_indices))

    def _masked_copies(split: GraphList) -> list:
        # Mask a copy of each graph of the given split.
        return [
            _mask_split(
                graph.__copy__(), targets, node_indices, method, feature_indices
            )
            for graph in split
        ]

    X_train_mask = GraphList(
        _masked_copies(X_train),
        X_train.num_nodes,
        X_train.node_feature_names,
        num_edges=X_train.num_edges,
        edge_feature_names=X_train.edge_feature_names,
        strict_checks=X_train.strict_checks,
    )
    X_test_mask = GraphList(
        _masked_copies(X_test),
        num_nodes=X_test.num_nodes,
        node_feature_names=X_test.node_feature_names,
        num_edges=X_test.num_edges,
        edge_feature_names=X_test.edge_feature_names,
        strict_checks=X_test.strict_checks,
    )

    return X_train_mask, X_test_mask