Skip to content

Post Study

Octopus post-study package — prediction and analysis from saved studies.

Top-level imports provide the core prediction interface. Analysis functions (tables, plots, notebook wrappers) are available via submodule imports::

from octopus.poststudy import OctoPredictor, OctoTestEvaluator
from octopus.poststudy.analysis.tables import get_performance
from octopus.poststudy.analysis.plots import dev_performance_plot, performance_plot
from octopus.poststudy.analysis.notebook import display_study_overview

OctoPredictor

Bases: _PredictorBase

Ensemble model for predicting on new, unseen data.

Wraps the fitted models from a single task across all outer splits. All methods require explicit data — no test/train data is stored. All results are computed fresh from loaded models.

Parameters:

Name Type Description Default
study_info

StudyInfo returned by load_study_information().

required
task_id

Concrete workflow task index (must be >= 0).

required
result_type

Result type for filtering results (default: 'best').

'best'

Raises:

Type Description
ValueError

If task_id is negative, out of range, or no models found.

FileNotFoundError

If expected study artifacts are missing.

Example

info = load_study_information("studies/my_study")
tp = OctoPredictor(study_info=info, task_id=0)
predictions = tp.predict(new_data)

Source code in octopus/poststudy/predict/predictor.py
@define(slots=False)
class OctoPredictor(_PredictorBase):
    """Ensemble model for predicting on new, unseen data.

    Wraps the fitted models from a single task across all outer splits.
    All methods require **explicit data** — no test/train data is stored.
    All results are computed fresh from loaded models.

    Args:
        study_info: ``StudyInfo`` returned by ``load_study_information()``.
        task_id: Concrete workflow task index (must be >= 0).
        result_type: Result type for filtering results (default: 'best').

    Raises:
        ValueError: If task_id is negative, out of range, or no models found.
        FileNotFoundError: If expected study artifacts are missing.

    Example:
        >>> info = load_study_information("studies/my_study")
        >>> tp = OctoPredictor(study_info=info, task_id=0)
        >>> predictions = tp.predict(new_data)
    """

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        """Predict on new data using all outer-split models.

        Return a wide-format DataFrame with one row per sample.
        Columns: ``row_id``, ``split_0``, ``split_1``, ..., ``ensemble``.

        For regression and time-to-event, the ``ensemble`` column is the
        arithmetic mean of per-split predictions.  For classification, it
        contains class labels derived from the argmax of ensemble-averaged
        probabilities.

        Args:
            data: DataFrame containing feature columns.

        Returns:
            Wide-format DataFrame with per-split and ensemble predictions.
        """
        per_split_preds = {sid: self._predict_raw(sid, data) for sid in self._study_info.outersplits}

        result = pd.DataFrame({"row_id": self._get_row_ids(data)})
        for split_id, preds in per_split_preds.items():
            result[f"split_{split_id}"] = preds

        if self._study_info.ml_type in (MLType.BINARY, MLType.MULTICLASS):
            # Soft vote: average the per-split probability matrices, then take
            # the argmax class of the averaged probabilities.
            per_split_probas = {sid: self._predict_proba_raw(sid, data) for sid in self._study_info.outersplits}
            avg_proba = np.mean(list(per_split_probas.values()), axis=0)
            result["ensemble"] = self.classes_[np.argmax(avg_proba, axis=1)]
        else:
            result["ensemble"] = np.mean(list(per_split_preds.values()), axis=0)

        return result

    def predict_proba(self, data: pd.DataFrame) -> pd.DataFrame:
        """Predict probabilities on new data (classification/multiclass only).

        Return a wide-format DataFrame with one row per sample.
        Columns: ``row_id``, one column per class label (ensemble-averaged),
        then ``<class>_split_0``, ``<class>_split_1``, ... for per-split detail.

        Args:
            data: DataFrame containing feature columns.

        Returns:
            Wide-format DataFrame with ensemble and per-split probabilities.

        Raises:
            TypeError: If ml_type is not classification or multiclass.
        """
        self._check_classification_only("predict_proba")
        per_split_probas = {sid: self._predict_proba_raw(sid, data) for sid in self._study_info.outersplits}
        class_labels = self.classes_
        # Element-wise mean over the per-split (n_samples, n_classes) arrays.
        ensemble: np.ndarray = np.mean(list(per_split_probas.values()), axis=0)

        result = pd.DataFrame({"row_id": self._get_row_ids(data)})
        for i, label in enumerate(class_labels):
            result[label] = ensemble[:, i]
        for split_id, probas in per_split_probas.items():
            for i, label in enumerate(class_labels):
                result[f"{label}_split_{split_id}"] = probas[:, i]
        return result

    def performance(
        self,
        data: pd.DataFrame,
        metrics: list[str] | None = None,
        threshold: float = 0.5,
    ) -> pd.DataFrame:
        """Compute performance on provided data per outersplit with Mean and Ensemble.

        Each outer-split model is scored independently on the **same** data.
        The ``Mean`` row averages per-split scores.  The ``Ensemble`` row
        scores the ensemble-averaged predictions against ground truth.

        Args:
            data: Data to score on; must contain feature columns + target column.
            metrics: List of metric names to compute.
                If None, auto-detected from the ML type.
            threshold: Classification threshold for threshold-dependent metrics.

        Returns:
            Wide DataFrame with outersplit IDs as index (plus ``Mean`` and
            ``Ensemble``), metrics as columns.
        """
        metrics = self._resolve_metrics(metrics)
        data_per_split = dict.fromkeys(self._study_info.outersplits, data)
        df = self._compute_per_split_scores(data_per_split, metrics, threshold)
        df.loc["Mean"] = df.mean()

        # NOTE(review): for classification, predict() derives ensemble labels via
        # argmax of averaged probabilities (an implicit 0.5 cut in the binary
        # case), so `threshold` does not influence the ensemble *labels* here —
        # confirm this is intended.
        pred_df = self.predict(data)
        pred_with_target = pd.DataFrame(
            {
                "prediction": pred_df["ensemble"].values,
                **self._get_target_columns(data),
            }
        )
        proba_df = self.predict_proba(data) if self._needs_proba(metrics) else None
        df.loc["Ensemble"] = self._compute_summary_scores(pred_with_target, proba_df, metrics, threshold)
        return df

    def _build_pool_data(self, data: pd.DataFrame) -> dict[int, pd.DataFrame]:
        """Build per-split pool data for permutation FI.

        In study-connected mode, filters ``data_prepared.parquet`` using the
        stored split row IDs to recover per-split traindev data.  In deployment
        mode (loaded via ``OctoPredictor.load``), the user-provided ``data`` is
        used as the pool for all splits.

        The prepared data is cached on first read so repeated ``calculate_fi``
        calls (e.g. permutation then SHAP) do not re-read from disk.

        Args:
            data: User-provided data (used as fallback for all splits).

        Returns:
            Dict mapping outersplit_id to pool DataFrame.
        """
        if not hasattr(self, "_prepared_data"):
            # Lazy one-time cache; setting an attribute not declared on the
            # class works because of @define(slots=False).
            prepared_path = self._study_info.path / "data_prepared.parquet"
            self._prepared_data: pd.DataFrame | None = parquet_load(prepared_path) if prepared_path.exists() else None

        pool: dict[int, pd.DataFrame] = {}
        for split_id in self._study_info.outersplits:
            try:
                pool[split_id] = load_partition(self._study_info.path, split_id, "traindev", self._prepared_data)
            except (FileNotFoundError, KeyError):
                # Deployment mode (or missing partition): fall back to the
                # caller-supplied data for this split.
                pool[split_id] = data

        return pool

    def calculate_fi(
        self,
        data: pd.DataFrame,
        fi_type: FIType = FIType.PERMUTATION,
        *,
        n_repeats: int = 10,
        feature_groups: dict[str, list[str]] | None = None,
        random_state: int = 42,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Calculate feature importance on provided data across all outer splits.

        Computes FI fresh from loaded models, providing p-values,
        confidence intervals, and group permutation support.

        Args:
            data: Data to compute FI on (must contain features + target).
            fi_type: Type of feature importance. One of:
                - ``FIType.PERMUTATION`` — Per-feature permutation importance.
                - ``FIType.GROUP_PERMUTATION`` — Per-feature + per-group permutation
                  importance.  Uses ``feature_groups`` (from study config or
                  explicitly provided) to also compute group-level importance.
                - ``FIType.SHAP`` — SHAP-based importance.  Pass ``shap_type`` as a
                  kwarg to select the explainer: ``"kernel"`` (default),
                  ``"permutation"``, or ``"exact"``.
            n_repeats: Number of permutation repeats (for permutation FI).
            feature_groups: Dict mapping group names to feature lists
                (for group_permutation).  If None and fi_type is
                ``FIType.GROUP_PERMUTATION``, groups are loaded from the study.
            random_state: Random seed.
            **kwargs: Additional keyword arguments passed to the FI function.
                For ``fi_type=FIType.SHAP``, supported kwargs include:
                ``shap_type`` (``"kernel"``, ``"permutation"``, ``"exact"``),
                ``max_samples``, ``background_size``.

        Returns:
            DataFrame with feature importance results including a ``fi_type``
            column and per-split + ensemble rows.

        Raises:
            ValueError: If fi_type is unknown.
        """
        # Coerce strings/aliases into the enum; raises ValueError when unknown.
        fi_type = FIType(fi_type)

        # All splits share the same DataFrame reference.  Safe because
        # compute_permutation_single / compute_shap_single copy data before mutating.
        test_data = dict.fromkeys(self._study_info.outersplits, data)
        train_data = self._build_pool_data(data)

        return self._dispatch_fi(
            test_data,
            train_data,
            fi_type,
            n_repeats=n_repeats,
            feature_groups=feature_groups,
            random_state=random_state,
            **kwargs,
        )

    def save(self, path: str | UPath) -> None:
        """Save the predictor for standalone deployment.

        Writes a self-contained directory with models + metadata only
        (no data). The saved predictor can be loaded later without the
        original study directory.

        Args:
            path: Directory path to save to. Created if it doesn't exist.
        """
        save_dir = UPath(path)
        save_dir.mkdir(parents=True, exist_ok=True)

        si = self._study_info
        metadata = {
            "task_id": self._task_id,
            "ml_type": si.ml_type,
            "target_metric": si.target_metric,
            "target_col": si.target_col,
            "target_assignments": si.target_assignments,
            "positive_class": si.positive_class,
            "row_id_col": si.row_id_col,
            "feature_cols": self._feature_cols,
            "outersplits": si.outersplits,
            "result_type": self._result_type,
            "feature_cols_per_split": {str(k): v for k, v in self._feature_cols_per_split.items()},
            "feature_groups_per_split": {str(k): v for k, v in self._feature_groups_per_split.items()},
        }
        # NOTE(review): default=str stringifies any non-JSON value (e.g. the
        # ml_type / result_type enums); confirm the resulting text round-trips
        # through MLType(...) / ResultType(...) in load().
        with (save_dir / "metadata.json").open("w") as f:
            json.dump(metadata, f, indent=2, default=str)

        models_dir = save_dir / "models"
        models_dir.mkdir(parents=True, exist_ok=True)
        for split_id in si.outersplits:
            joblib_save(self._models[split_id], models_dir / f"model_{split_id:03d}.joblib")

        with (save_dir / "version.json").open("w") as f:
            json.dump({"octopus_version": get_version()}, f, indent=2)

    @classmethod
    def load(cls, path: str | UPath) -> OctoPredictor:
        """Load a previously saved predictor.

        Args:
            path: Directory path containing the saved predictor.

        Returns:
            A new OctoPredictor instance that can predict without the
            original study directory.
        """
        load_dir = UPath(path)

        with (load_dir / "metadata.json").open() as f:
            metadata_dict = json.load(f)

        # Warn (but do not fail) when the saved octopus version differs from
        # the one currently running; absent version.json is tolerated.
        version_path = load_dir / "version.json"
        if version_path.exists():
            with version_path.open() as f:
                version_info = json.load(f)
            saved_version = version_info.get("octopus_version", "unknown")
            current_version = get_version()
            if saved_version not in ("unknown", current_version):
                import warnings  # noqa: PLC0415

                warnings.warn(
                    f"Predictor was saved with octopus {saved_version}, "
                    f"but current version is {current_version}. "
                    f"Predictions may differ.",
                    stacklevel=2,
                )

        # Bypass the attrs-generated __init__ (which expects a live study
        # directory) and populate the instance fields manually below.
        instance = OctoPredictor.__new__(OctoPredictor)

        outersplits = metadata_dict.get("outersplits", [])
        try:
            instance._study_info = StudyInfo(
                path=UPath(load_dir),
                n_outer_splits=len(outersplits),
                workflow_tasks=(),
                outersplit_dirs=(),
                ml_type=MLType(metadata_dict["ml_type"]),
                target_metric=metadata_dict["target_metric"],
                target_col=metadata_dict["target_col"],
                target_assignments=metadata_dict.get("target_assignments", {}),
                positive_class=metadata_dict.get("positive_class"),
                row_id_col=metadata_dict.get("row_id_col"),
                feature_cols=metadata_dict.get("feature_cols", []),
                outersplit_ids=tuple(outersplits),
            )
            instance._result_type = ResultType(metadata_dict.get("result_type", "best"))
            instance._task_id = metadata_dict["task_id"]
        except (KeyError, ValueError) as e:
            raise ValueError(
                f"Saved predictor metadata is incomplete or corrupted: {e}. "
                f"Re-save the predictor or check {load_dir / 'metadata.json'}."
            ) from e
        instance._feature_cols = metadata_dict.get("feature_cols", [])
        # JSON keys are always strings; restore the integer split-id keys.
        instance._feature_cols_per_split = {
            int(k): v for k, v in metadata_dict.get("feature_cols_per_split", {}).items()
        }
        instance._feature_groups_per_split = {
            int(k): v for k, v in metadata_dict.get("feature_groups_per_split", {}).items()
        }

        instance._models = {}
        models_dir = load_dir / "models"
        for split_id in outersplits:
            instance._models[split_id] = joblib_load(models_dir / f"model_{split_id:03d}.joblib")

        return instance

calculate_fi(data, fi_type=FIType.PERMUTATION, *, n_repeats=10, feature_groups=None, random_state=42, **kwargs)

Calculate feature importance on provided data across all outer splits.

Computes FI fresh from loaded models, providing p-values, confidence intervals, and group permutation support.

Parameters:

Name Type Description Default
data DataFrame

Data to compute FI on (must contain features + target).

required
fi_type FIType

Type of feature importance. One of: - FIType.PERMUTATION — Per-feature permutation importance. - FIType.GROUP_PERMUTATION — Per-feature + per-group permutation importance. Uses feature_groups (from study config or explicitly provided) to also compute group-level importance. - FIType.SHAP — SHAP-based importance. Pass shap_type as a kwarg to select the explainer: "kernel" (default), "permutation", or "exact".

PERMUTATION
n_repeats int

Number of permutation repeats (for permutation FI).

10
feature_groups dict[str, list[str]] | None

Dict mapping group names to feature lists (for group_permutation). If None and fi_type is FIType.GROUP_PERMUTATION, groups are loaded from the study.

None
random_state int

Random seed.

42
**kwargs Any

Additional keyword arguments passed to the FI function. For fi_type=FIType.SHAP, supported kwargs include: shap_type ("kernel", "permutation", "exact"), max_samples, background_size.

{}

Returns:

Type Description
DataFrame

DataFrame with feature importance results including a fi_type

DataFrame

column and per-split + ensemble rows.

Raises:

Type Description
ValueError

If fi_type is unknown.

Source code in octopus/poststudy/predict/predictor.py
def calculate_fi(
    self,
    data: pd.DataFrame,
    fi_type: FIType = FIType.PERMUTATION,
    *,
    n_repeats: int = 10,
    feature_groups: dict[str, list[str]] | None = None,
    random_state: int = 42,
    **kwargs: Any,
) -> pd.DataFrame:
    """Compute feature importance for ``data`` across every outer split.

    Importances are recomputed from the loaded models, yielding p-values,
    confidence intervals, and (optionally) group-level permutation scores.

    Args:
        data: Data to compute FI on (must contain features + target).
        fi_type: Which importance flavour to compute:
            - ``FIType.PERMUTATION`` — per-feature permutation importance.
            - ``FIType.GROUP_PERMUTATION`` — per-feature plus per-group
              permutation importance, driven by ``feature_groups`` (explicit
              or loaded from the study config).
            - ``FIType.SHAP`` — SHAP-based importance; select the explainer
              via the ``shap_type`` kwarg (``"kernel"`` default,
              ``"permutation"``, or ``"exact"``).
        n_repeats: Permutation repeat count (permutation FI only).
        feature_groups: Group-name -> feature-list mapping for
            group permutation.  When None with
            ``FIType.GROUP_PERMUTATION``, groups come from the study.
        random_state: Random seed.
        **kwargs: Forwarded to the underlying FI function.  For
            ``fi_type=FIType.SHAP`` this includes ``shap_type``
            (``"kernel"``, ``"permutation"``, ``"exact"``), ``max_samples``,
            and ``background_size``.

    Returns:
        DataFrame of FI results with a ``fi_type`` column and
        per-split + ensemble rows.

    Raises:
        ValueError: If fi_type is unknown.
    """
    resolved_type = FIType(fi_type)

    # Every split points at the same DataFrame object — safe, since
    # compute_permutation_single / compute_shap_single copy before mutating.
    eval_data = {sid: data for sid in self._study_info.outersplits}
    pool_data = self._build_pool_data(data)

    return self._dispatch_fi(
        eval_data,
        pool_data,
        resolved_type,
        n_repeats=n_repeats,
        feature_groups=feature_groups,
        random_state=random_state,
        **kwargs,
    )

load(path) classmethod

Load a previously saved predictor.

Parameters:

Name Type Description Default
path str | UPath

Directory path containing the saved predictor.

required

Returns:

Type Description
OctoPredictor

A new OctoPredictor instance that can predict without the

OctoPredictor

original study directory.

Source code in octopus/poststudy/predict/predictor.py
@classmethod
def load(cls, path: str | UPath) -> OctoPredictor:
    """Load a previously saved predictor.

    Args:
        path: Directory path containing the saved predictor.

    Returns:
        A new OctoPredictor instance that can predict without the
        original study directory.
    """
    load_dir = UPath(path)

    with (load_dir / "metadata.json").open() as f:
        metadata_dict = json.load(f)

    # Warn (but do not fail) when the saved octopus version differs from
    # the one currently running; absent version.json is tolerated.
    version_path = load_dir / "version.json"
    if version_path.exists():
        with version_path.open() as f:
            version_info = json.load(f)
        saved_version = version_info.get("octopus_version", "unknown")
        current_version = get_version()
        if saved_version not in ("unknown", current_version):
            import warnings  # noqa: PLC0415

            warnings.warn(
                f"Predictor was saved with octopus {saved_version}, "
                f"but current version is {current_version}. "
                f"Predictions may differ.",
                stacklevel=2,
            )

    # Bypass the attrs-generated __init__ (which expects a live study
    # directory) and populate the instance fields manually below.
    instance = OctoPredictor.__new__(OctoPredictor)

    outersplits = metadata_dict.get("outersplits", [])
    try:
        instance._study_info = StudyInfo(
            path=UPath(load_dir),
            n_outer_splits=len(outersplits),
            workflow_tasks=(),
            outersplit_dirs=(),
            ml_type=MLType(metadata_dict["ml_type"]),
            target_metric=metadata_dict["target_metric"],
            target_col=metadata_dict["target_col"],
            target_assignments=metadata_dict.get("target_assignments", {}),
            positive_class=metadata_dict.get("positive_class"),
            row_id_col=metadata_dict.get("row_id_col"),
            feature_cols=metadata_dict.get("feature_cols", []),
            outersplit_ids=tuple(outersplits),
        )
        instance._result_type = ResultType(metadata_dict.get("result_type", "best"))
        instance._task_id = metadata_dict["task_id"]
    except (KeyError, ValueError) as e:
        raise ValueError(
            f"Saved predictor metadata is incomplete or corrupted: {e}. "
            f"Re-save the predictor or check {load_dir / 'metadata.json'}."
        ) from e
    instance._feature_cols = metadata_dict.get("feature_cols", [])
    # JSON keys are always strings; restore the integer split-id keys.
    instance._feature_cols_per_split = {
        int(k): v for k, v in metadata_dict.get("feature_cols_per_split", {}).items()
    }
    instance._feature_groups_per_split = {
        int(k): v for k, v in metadata_dict.get("feature_groups_per_split", {}).items()
    }

    instance._models = {}
    models_dir = load_dir / "models"
    for split_id in outersplits:
        instance._models[split_id] = joblib_load(models_dir / f"model_{split_id:03d}.joblib")

    return instance

performance(data, metrics=None, threshold=0.5)

Compute performance on provided data per outersplit with Mean and Ensemble.

Each outer-split model is scored independently on the same data. The Mean row averages per-split scores. The Ensemble row scores the ensemble-averaged predictions against ground truth.

Parameters:

Name Type Description Default
data DataFrame

Data to score on; must contain feature columns + target column.

required
metrics list[str] | None

List of metric names to compute. If None, auto-detected from the ML type.

None
threshold float

Classification threshold for threshold-dependent metrics.

0.5

Returns:

Type Description
DataFrame

Wide DataFrame with outersplit IDs as index (plus Mean and

DataFrame

Ensemble), metrics as columns.

Source code in octopus/poststudy/predict/predictor.py
def performance(
    self,
    data: pd.DataFrame,
    metrics: list[str] | None = None,
    threshold: float = 0.5,
) -> pd.DataFrame:
    """Score the provided data per outersplit, adding Mean and Ensemble rows.

    Every outer-split model is evaluated independently on the **same**
    data.  ``Mean`` is the average of per-split scores; ``Ensemble``
    scores the ensemble-averaged predictions against ground truth.

    Args:
        data: Data to score on; must contain feature columns + target column.
        metrics: Metric names to compute; auto-detected from the ML type
            when None.
        threshold: Classification threshold for threshold-dependent metrics.

    Returns:
        Wide DataFrame indexed by outersplit IDs (plus ``Mean`` and
        ``Ensemble``), one column per metric.
    """
    resolved = self._resolve_metrics(metrics)
    per_split_data = {sid: data for sid in self._study_info.outersplits}
    scores = self._compute_per_split_scores(per_split_data, resolved, threshold)
    scores.loc["Mean"] = scores.mean()

    ensemble_pred = self.predict(data)["ensemble"].values
    scored_frame = pd.DataFrame(
        {
            "prediction": ensemble_pred,
            **self._get_target_columns(data),
        }
    )
    probas = self.predict_proba(data) if self._needs_proba(resolved) else None
    scores.loc["Ensemble"] = self._compute_summary_scores(scored_frame, probas, resolved, threshold)
    return scores

predict(data)

Predict on new data using all outer-split models.

Return a wide-format DataFrame with one row per sample. Columns: row_id, split_0, split_1, ..., ensemble.

For regression and time-to-event, the ensemble column is the arithmetic mean of per-split predictions. For classification, it contains class labels derived from the argmax of ensemble-averaged probabilities.

Parameters:

Name Type Description Default
data DataFrame

DataFrame containing feature columns.

required

Returns:

Type Description
DataFrame

Wide-format DataFrame with per-split and ensemble predictions.

Source code in octopus/poststudy/predict/predictor.py
def predict(self, data: pd.DataFrame) -> pd.DataFrame:
    """Predict on new data with every outer-split model.

    Produces a wide-format DataFrame, one row per sample, with columns
    ``row_id``, ``split_0``, ``split_1``, ..., ``ensemble``.

    For regression and time-to-event the ``ensemble`` column is the mean
    of the per-split predictions; for classification it holds the class
    labels at the argmax of the ensemble-averaged probabilities.

    Args:
        data: DataFrame containing feature columns.

    Returns:
        Wide-format DataFrame with per-split and ensemble predictions.
    """
    splits = self._study_info.outersplits
    raw_preds = {sid: self._predict_raw(sid, data) for sid in splits}

    out = pd.DataFrame({"row_id": self._get_row_ids(data)})
    for sid in splits:
        out[f"split_{sid}"] = raw_preds[sid]

    if self._study_info.ml_type in (MLType.BINARY, MLType.MULTICLASS):
        # Soft vote: average probabilities across splits, then argmax.
        proba_stack = [self._predict_proba_raw(sid, data) for sid in splits]
        mean_proba = np.mean(proba_stack, axis=0)
        out["ensemble"] = self.classes_[np.argmax(mean_proba, axis=1)]
    else:
        out["ensemble"] = np.mean(list(raw_preds.values()), axis=0)

    return out

predict_proba(data)

Predict probabilities on new data (classification/multiclass only).

Return a wide-format DataFrame with one row per sample. Columns: row_id, one column per class label (ensemble-averaged), then <class>_split_0, <class>_split_1, ... for per-split detail.

Parameters:

Name Type Description Default
data DataFrame

DataFrame containing feature columns.

required

Returns:

Type Description
DataFrame

Wide-format DataFrame with ensemble and per-split probabilities.

Raises:

Type Description
TypeError

If ml_type is not classification or multiclass.

Source code in octopus/poststudy/predict/predictor.py
def predict_proba(self, data: pd.DataFrame) -> pd.DataFrame:
    """Predict class probabilities on new data (classification only).

    Produces a wide-format DataFrame, one row per sample: ``row_id``,
    one ensemble-averaged column per class label, followed by
    ``<class>_split_0``, ``<class>_split_1``, ... per-split columns.

    Args:
        data: DataFrame containing feature columns.

    Returns:
        Wide-format DataFrame with ensemble and per-split probabilities.

    Raises:
        TypeError: If ml_type is not classification or multiclass.
    """
    self._check_classification_only("predict_proba")
    splits = self._study_info.outersplits
    probas_by_split = {sid: self._predict_proba_raw(sid, data) for sid in splits}
    labels = self.classes_
    mean_proba: np.ndarray = np.mean(list(probas_by_split.values()), axis=0)

    out = pd.DataFrame({"row_id": self._get_row_ids(data)})
    for col, label in enumerate(labels):
        out[label] = mean_proba[:, col]
    for sid in splits:
        split_proba = probas_by_split[sid]
        for col, label in enumerate(labels):
            out[f"{label}_split_{sid}"] = split_proba[:, col]
    return out

save(path)

Save the predictor for standalone deployment.

Writes a self-contained directory with models + metadata only (no data). The saved predictor can be loaded later without the original study directory.

Parameters:

Name Type Description Default
path str | UPath

Directory path to save to. Created if it doesn't exist.

required
Source code in octopus/poststudy/predict/predictor.py
def save(self, path: str | UPath) -> None:
    """Persist this predictor as a standalone deployment artifact.

    Creates a self-contained directory holding only the fitted models
    and JSON metadata — no data.  The result can be restored later with
    ``load`` even without the original study directory.

    Args:
        path: Directory path to save to. Created if it doesn't exist.
    """
    out_dir = UPath(path)
    out_dir.mkdir(parents=True, exist_ok=True)

    info = self._study_info
    metadata = {
        "task_id": self._task_id,
        "ml_type": info.ml_type,
        "target_metric": info.target_metric,
        "target_col": info.target_col,
        "target_assignments": info.target_assignments,
        "positive_class": info.positive_class,
        "row_id_col": info.row_id_col,
        "feature_cols": self._feature_cols,
        "outersplits": info.outersplits,
        "result_type": self._result_type,
        "feature_cols_per_split": {str(sid): cols for sid, cols in self._feature_cols_per_split.items()},
        "feature_groups_per_split": {str(sid): groups for sid, groups in self._feature_groups_per_split.items()},
    }
    with (out_dir / "metadata.json").open("w") as fh:
        json.dump(metadata, fh, indent=2, default=str)

    model_dir = out_dir / "models"
    model_dir.mkdir(parents=True, exist_ok=True)
    for sid in info.outersplits:
        joblib_save(self._models[sid], model_dir / f"model_{sid:03d}.joblib")

    with (out_dir / "version.json").open("w") as fh:
        json.dump({"octopus_version": get_version()}, fh, indent=2)

OctoTestEvaluator

Bases: _PredictorBase

Predictor for analysing study results on held-out test data.

Stores test and train data per outer split. Methods use stored test data implicitly — the caller never needs to pass data.

Each outer-split model predicts only on its corresponding test data. No averaging across splits.

Parameters:

Name Type Description Default
study_info

StudyInfo returned by load_study_information().

required
task_id

Concrete workflow task index (must be >= 0).

required
result_type

Result type for filtering results (default: 'best').

'best'

Raises:

Type Description
ValueError

If task_id is negative, out of range, or no models found.

FileNotFoundError

If expected study artifacts are missing.

Example

info = load_study_information("studies/my_study")
tp = OctoTestEvaluator(study_info=info, task_id=0)
scores = tp.performance(metrics=["AUCROC", "ACC"])

Source code in octopus/poststudy/analysis/evaluator.py
@define(slots=False)
class OctoTestEvaluator(_PredictorBase):
    """Analyse study results on the held-out test partitions.

    Test and train data are loaded once per outer split and kept on the
    instance, so methods never require the caller to supply data.

    Every outer-split model is evaluated **only** on the test partition
    of its own split; predictions are never averaged across splits.

    Args:
        study_info: ``StudyInfo`` returned by ``load_study_information()``.
        task_id: Concrete workflow task index (must be >= 0).
        result_type: Result type for filtering results (default: 'best').

    Raises:
        ValueError: If task_id is negative, out of range, or no models found.
        FileNotFoundError: If expected study artifacts are missing.

    Example:
        >>> info = load_study_information("studies/my_study")
        >>> tp = OctoTestEvaluator(study_info=info, task_id=0)
        >>> scores = tp.performance(metrics=["AUCROC", "ACC"])
    """

    _test_data: dict[int, pd.DataFrame] = field(init=False, factory=dict, repr=False)
    _train_data: dict[int, pd.DataFrame] = field(init=False, factory=dict, repr=False)

    def __attrs_post_init__(self) -> None:
        """Run the parent loader, then read per-split test/train partitions."""
        super().__attrs_post_init__()

        from octopus.poststudy.study_io import load_partition  # noqa: PLC0415

        study_path = self._study_info.path
        prepared = parquet_load(study_path / "data_prepared.parquet")

        for sid in self._study_info.outersplits:
            self._train_data[sid] = load_partition(study_path, sid, "traindev", prepared)
            self._test_data[sid] = load_partition(study_path, sid, "test", prepared)

    def predict(self) -> pd.DataFrame:
        """Predict on the stored test data, one model per split.

        Results are collected per split; no ensemble averaging happens.

        Returns:
            DataFrame with columns ``outersplit``, ``row_id``, ``prediction``
            and the target column(s).  T2E tasks report ``target_duration``
            and ``target_event`` instead of ``target``.
        """
        frames: list[pd.DataFrame] = []

        for sid in self._study_info.outersplits:
            data = self._test_data[sid]
            frames.append(
                pd.DataFrame(
                    {
                        "outersplit": sid,
                        "row_id": self._get_row_ids(data),
                        "prediction": self._predict_raw(sid, data),
                        **self._get_target_columns(data),
                    }
                )
            )

        return pd.concat(frames, ignore_index=True)

    def predict_proba(self) -> pd.DataFrame:
        """Predict class probabilities on the stored test data.

        Only valid for classification/multiclass studies.  Each model
        predicts solely on its own test partition; nothing is averaged.

        Returns:
            DataFrame with columns ``outersplit``, ``row_id``, one
            probability column per class label, and the target column(s).

        Raises:
            TypeError: If ml_type is not classification or multiclass.
        """
        self._check_classification_only("predict_proba")
        labels = self.classes_
        frames: list[pd.DataFrame] = []

        for sid in self._study_info.outersplits:
            data = self._test_data[sid]
            frame = pd.DataFrame(self._predict_proba_raw(sid, data), columns=labels)
            # Prepend identifiers, then append target columns, so the final
            # layout is: outersplit, row_id, <class labels...>, <targets...>.
            frame.insert(0, "row_id", self._get_row_ids(data))
            frame.insert(0, "outersplit", sid)
            for name, values in self._get_target_columns(data).items():
                frame[name] = values
            frames.append(frame)

        return pd.concat(frames, ignore_index=True)

    def performance(
        self,
        metrics: list[str] | None = None,
        threshold: float = 0.5,
    ) -> pd.DataFrame:
        """Score stored test data per outersplit, plus ``Mean`` and ``Merged`` rows.

        Each model is scored only on its own test partition.  ``Mean`` is
        the average of the per-split rows; ``Merged`` pools every test
        prediction and scores the pooled set once.

        Args:
            metrics: Metric names to compute.  Auto-detected from the ML
                type when None.
            threshold: Classification threshold for threshold-dependent metrics.

        Returns:
            Wide DataFrame indexed by outersplit ID (plus ``Mean`` and
            ``Merged``) with one column per metric.
        """
        metric_names = self._resolve_metrics(metrics)
        scores = self._compute_per_split_scores(self._test_data, metric_names, threshold)
        # Compute Mean before adding Merged so it averages only the split rows.
        scores.loc["Mean"] = scores.mean()

        predictions = self.predict()
        probabilities = self.predict_proba() if self._needs_proba(metric_names) else None
        scores.loc["Merged"] = self._compute_summary_scores(predictions, probabilities, metric_names, threshold)
        return scores

    def calculate_fi(
        self,
        fi_type: FIType = FIType.PERMUTATION,
        *,
        n_repeats: int = 10,
        feature_groups: dict[str, list[str]] | None = None,
        random_state: int = 42,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Compute feature importance from the stored models and test data.

        Each split's model permutes features only within its own test
        partition.  The work is delegated to ``_dispatch_fi()`` (inherited
        from ``_PredictorBase``) with the stored per-split test/train data.

        Args:
            fi_type: Kind of importance to compute:
                - ``FIType.PERMUTATION`` — per-feature permutation importance.
                - ``FIType.GROUP_PERMUTATION`` — per-feature plus per-group
                  permutation importance; groups come from ``feature_groups``
                  or, if None, from the study configuration.
                - ``FIType.SHAP`` — SHAP-based importance; pass ``shap_type``
                  (``"kernel"`` default, ``"permutation"``, or ``"exact"``)
                  as a kwarg to pick the explainer.
            n_repeats: Number of permutation repeats.
            feature_groups: Mapping of group name to feature list.  Loaded
                from the study when None and grouping is requested.
            random_state: Random seed.
            **kwargs: Forwarded to the FI function.  For SHAP this includes
                ``shap_type``, ``max_samples``, and ``background_size``.

        Returns:
            DataFrame of importance results with a ``fi_type`` column and
            per-split plus ensemble rows.

        Raises:
            ValueError: If fi_type is unknown.
        """
        return self._dispatch_fi(
            self._test_data,
            self._train_data,
            # Coerce early so an invalid value raises before any work is done.
            FIType(fi_type),
            n_repeats=n_repeats,
            feature_groups=feature_groups,
            random_state=random_state,
            **kwargs,
        )

__attrs_post_init__()

Load base artifacts via parent, then additionally load test/train data.

Source code in octopus/poststudy/analysis/evaluator.py
def __attrs_post_init__(self) -> None:
    """Run the parent loader, then read per-split test/train partitions."""
    super().__attrs_post_init__()

    from octopus.poststudy.study_io import load_partition  # noqa: PLC0415

    study_path = self._study_info.path
    prepared = parquet_load(study_path / "data_prepared.parquet")

    for sid in self._study_info.outersplits:
        self._train_data[sid] = load_partition(study_path, sid, "traindev", prepared)
        self._test_data[sid] = load_partition(study_path, sid, "test", prepared)

calculate_fi(fi_type=FIType.PERMUTATION, *, n_repeats=10, feature_groups=None, random_state=42, **kwargs)

Calculate feature importance using stored test data and models.

Each split's model permutes features only in its own test data. Delegates to _dispatch_fi() (inherited from _PredictorBase) with stored per-split test and train data.

Parameters:

Name Type Description Default
fi_type FIType

Type of feature importance. One of:

- FIType.PERMUTATION — Per-feature permutation importance.
- FIType.GROUP_PERMUTATION — Per-feature + per-group permutation importance. Uses feature_groups (from study config or explicitly provided) to also compute group-level importance.
- FIType.SHAP — SHAP-based importance. Pass shap_type as a kwarg to select the explainer: "kernel" (default), "permutation", or "exact".

PERMUTATION
n_repeats int

Number of permutation repeats.

10
feature_groups dict[str, list[str]] | None

Dict mapping group names to feature lists. If None and fi_type is FIType.GROUP_PERMUTATION, groups are loaded from the study.

None
random_state int

Random seed.

42
**kwargs Any

Additional keyword arguments passed to the FI function. For fi_type=FIType.SHAP, supported kwargs include: shap_type ("kernel", "permutation", "exact"), max_samples, background_size.

{}

Returns:

Type Description
DataFrame

DataFrame with feature importance results including a fi_type

DataFrame

column and per-split + ensemble rows.

Raises:

Type Description
ValueError

If fi_type is unknown.

Source code in octopus/poststudy/analysis/evaluator.py
def calculate_fi(
    self,
    fi_type: FIType = FIType.PERMUTATION,
    *,
    n_repeats: int = 10,
    feature_groups: dict[str, list[str]] | None = None,
    random_state: int = 42,
    **kwargs: Any,
) -> pd.DataFrame:
    """Compute feature importance from the stored models and test data.

    Each split's model permutes features only within its own test
    partition.  The work is delegated to ``_dispatch_fi()`` (inherited
    from ``_PredictorBase``) with the stored per-split test/train data.

    Args:
        fi_type: Kind of importance to compute:
            - ``FIType.PERMUTATION`` — per-feature permutation importance.
            - ``FIType.GROUP_PERMUTATION`` — per-feature plus per-group
              permutation importance; groups come from ``feature_groups``
              or, if None, from the study configuration.
            - ``FIType.SHAP`` — SHAP-based importance; pass ``shap_type``
              (``"kernel"`` default, ``"permutation"``, or ``"exact"``)
              as a kwarg to pick the explainer.
        n_repeats: Number of permutation repeats.
        feature_groups: Mapping of group name to feature list.  Loaded
            from the study when None and grouping is requested.
        random_state: Random seed.
        **kwargs: Forwarded to the FI function.  For SHAP this includes
            ``shap_type``, ``max_samples``, and ``background_size``.

    Returns:
        DataFrame of importance results with a ``fi_type`` column and
        per-split plus ensemble rows.

    Raises:
        ValueError: If fi_type is unknown.
    """
    return self._dispatch_fi(
        self._test_data,
        self._train_data,
        # Coerce early so an invalid value raises before any work is done.
        FIType(fi_type),
        n_repeats=n_repeats,
        feature_groups=feature_groups,
        random_state=random_state,
        **kwargs,
    )

performance(metrics=None, threshold=0.5)

Compute performance on stored test data per outersplit with Mean and Merged.

Each outer-split model is scored only on its own test data. The Mean row averages per-split scores. The Merged row pools all test predictions and scores them as one set.

Parameters:

Name Type Description Default
metrics list[str] | None

List of metric names to compute. If None, auto-detected from the ML type.

None
threshold float

Classification threshold for threshold-dependent metrics.

0.5

Returns:

Type Description
DataFrame

Wide DataFrame with outersplit IDs as index (plus Mean and

DataFrame

Merged), metrics as columns.

Source code in octopus/poststudy/analysis/evaluator.py
def performance(
    self,
    metrics: list[str] | None = None,
    threshold: float = 0.5,
) -> pd.DataFrame:
    """Compute performance on stored test data per outersplit with Mean and Merged.

    Each outer-split model is scored **only on its own test data**.
    The ``Mean`` row averages per-split scores.  The ``Merged`` row
    pools all test predictions and scores them as one set.

    Args:
        metrics: List of metric names to compute.
            If None, auto-detected from the ML type.
        threshold: Classification threshold for threshold-dependent metrics.

    Returns:
        Wide DataFrame with outersplit IDs as index (plus ``Mean`` and
        ``Merged``), metrics as columns.
    """
    metrics = self._resolve_metrics(metrics)
    df = self._compute_per_split_scores(self._test_data, metrics, threshold)
    df.loc["Mean"] = df.mean()

    pred_df = self.predict()
    proba_df = self.predict_proba() if self._needs_proba(metrics) else None
    df.loc["Merged"] = self._compute_summary_scores(pred_df, proba_df, metrics, threshold)
    return df

predict()

Predict on stored test data. Each model predicts only on its own test data.

No ensemble averaging — results are collected per split.

Returns:

Type Description
DataFrame

DataFrame with columns: outersplit, row_id, prediction,

DataFrame

and target column(s). For T2E tasks the target columns are

DataFrame

target_duration and target_event instead of target.

Source code in octopus/poststudy/analysis/evaluator.py
def predict(self) -> pd.DataFrame:
    """Predict on the stored test data, one model per split.

    Results are collected per split; no ensemble averaging happens.

    Returns:
        DataFrame with columns ``outersplit``, ``row_id``, ``prediction``
        and the target column(s).  T2E tasks report ``target_duration``
        and ``target_event`` instead of ``target``.
    """
    frames: list[pd.DataFrame] = []

    for sid in self._study_info.outersplits:
        data = self._test_data[sid]
        frames.append(
            pd.DataFrame(
                {
                    "outersplit": sid,
                    "row_id": self._get_row_ids(data),
                    "prediction": self._predict_raw(sid, data),
                    **self._get_target_columns(data),
                }
            )
        )

    return pd.concat(frames, ignore_index=True)

predict_proba()

Predict probabilities on stored test data (classification/multiclass only).

Each model predicts only on its own test data. No averaging.

Returns:

Type Description
DataFrame

DataFrame with columns: outersplit, row_id, one probability

DataFrame

column per class label, and target column(s).

Raises:

Type Description
TypeError

If ml_type is not classification or multiclass.

Source code in octopus/poststudy/analysis/evaluator.py
def predict_proba(self) -> pd.DataFrame:
    """Predict class probabilities on the stored test data.

    Only valid for classification/multiclass studies.  Each model
    predicts solely on its own test partition; nothing is averaged.

    Returns:
        DataFrame with columns ``outersplit``, ``row_id``, one
        probability column per class label, and the target column(s).

    Raises:
        TypeError: If ml_type is not classification or multiclass.
    """
    self._check_classification_only("predict_proba")
    labels = self.classes_
    frames: list[pd.DataFrame] = []

    for sid in self._study_info.outersplits:
        data = self._test_data[sid]
        frame = pd.DataFrame(self._predict_proba_raw(sid, data), columns=labels)
        # Prepend identifiers, then append target columns, so the final
        # layout is: outersplit, row_id, <class labels...>, <targets...>.
        frame.insert(0, "row_id", self._get_row_ids(data))
        frame.insert(0, "outersplit", sid)
        for name, values in self._get_target_columns(data).items():
            frame[name] = values
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

StudyInfo

Validated, immutable view of a completed study directory.

Returned by load_study_information(). Accepted by both analysis functions and predictor constructors. Does NOT store the raw config dict — all values are typed extractions.

Source code in octopus/poststudy/study_io.py
@frozen
class StudyInfo:
    """Validated, immutable view of a completed study directory.

    Produced by ``load_study_information()`` and consumed by both the
    analysis functions and the predictor constructors.  The raw config
    dict is NOT retained — every attribute is a typed extraction.
    """

    path: UPath
    n_outer_splits: int
    workflow_tasks: tuple[dict[str, Any], ...]
    outersplit_dirs: tuple[UPath, ...]
    ml_type: MLType
    target_metric: str
    target_col: str
    target_assignments: dict[str, str]
    positive_class: Any
    row_id_col: str | None
    feature_cols: list[str]
    _outersplit_ids: tuple[int, ...] | None = field(default=None, alias="outersplit_ids")

    @property
    def outersplits(self) -> list[int]:
        """Outersplit IDs.

        Resolution order:
        1. Explicit ``outersplit_ids`` (set by ``OctoPredictor.load()``).
        2. Derived from ``outersplit_dirs`` (normal study path).
        3. ``range(n_outer_splits)`` (last resort).
        """
        if self._outersplit_ids is not None:
            return list(self._outersplit_ids)
        dirs = self.outersplit_dirs
        if not dirs:
            return list(range(self.n_outer_splits))
        # Directory names follow the "outersplit<N>" pattern.
        return [int(d.name.replace("outersplit", "")) for d in dirs]

    @property
    def n_outersplits(self) -> int:
        """Number of outer splits, resolved like ``outersplits``."""
        if self._outersplit_ids is not None:
            return len(self._outersplit_ids)
        return len(self.outersplit_dirs) if self.outersplit_dirs else self.n_outer_splits

n_outersplits property

Number of outer splits.

outersplits property

Outersplit IDs.

Resolution order:

1. Explicit outersplit_ids (set by OctoPredictor.load()).
2. Derived from outersplit_dirs (normal study path).
3. range(n_outer_splits) (last resort).

load_study_information(study_directory)

Load and validate a study directory.

Reads study_config.json, discovers outersplit directories, validates structure, and extracts typed metadata into a frozen StudyInfo.

Parameters:

Name Type Description Default
study_directory str | UPath

Path to the study directory.

required

Returns:

Type Description
StudyInfo

Frozen StudyInfo with validated study metadata.

Raises:

Type Description
ValueError

If no outersplit directories are found.

FileNotFoundError

If the study directory or config does not exist.

Source code in octopus/poststudy/study_io.py
def load_study_information(study_directory: str | UPath) -> StudyInfo:
    """Load and validate a study directory.

    Reads ``study_config.json``, discovers the outersplit directories,
    checks the directory structure, and packs typed metadata into a
    frozen ``StudyInfo``.

    Args:
        study_directory: Path to the study directory.

    Returns:
        Frozen ``StudyInfo`` with validated study metadata.

    Raises:
        ValueError: If no outersplit directories are found.
        FileNotFoundError: If the study directory or config does not exist.
    """
    path = UPath(study_directory)
    if not path.exists():
        raise FileNotFoundError(f"Study directory not found: {path}")

    config = load_study_config(path)
    n_outer = config["n_outer_splits"]
    tasks = config["workflow"]

    def _split_index(d: UPath) -> int:
        # Directory names follow the "outersplit<N>" pattern.
        return int(d.name.replace("outersplit", ""))

    split_dirs = sorted((d for d in path.glob("outersplit*") if d.is_dir()), key=_split_index)
    if not split_dirs:
        raise ValueError(
            f"No outersplit directories found in study path.\n"
            f"Study path: {path}\nThe study may not have been run yet."
        )

    # Warn (don't fail) when some configured splits never produced output.
    missing_splits = [i for i in range(n_outer) if not (path / f"outersplit{i}").exists()]
    if missing_splits:
        warnings.warn(
            f"Missing outersplit directories: {missing_splits}\nStudy path: {path}",
            stacklevel=2,
        )

    # Warn about task directories absent from any discovered split.
    task_ids = [task["task_id"] for task in tasks]
    absent_task_dirs = []
    for d in split_dirs:
        for tid in task_ids:
            if not (d / f"task{tid}").exists():
                absent_task_dirs.append(f"{d.name}/task{tid}")
    if absent_task_dirs:
        warnings.warn(
            f"Missing workflow task directories: {absent_task_dirs}\nStudy path: {path}",
            stacklevel=2,
        )

    prepared = config.get("prepared", {})

    return StudyInfo(
        path=path,
        n_outer_splits=n_outer,
        workflow_tasks=tuple(tasks),
        outersplit_dirs=tuple(split_dirs),
        ml_type=MLType(config["ml_type"]),
        target_metric=config.get("target_metric", ""),
        target_col=config.get("target_col", ""),
        target_assignments=prepared.get("target_assignments", {}),
        positive_class=config.get("positive_class"),
        row_id_col=prepared.get("row_id_col"),
        feature_cols=prepared.get("feature_cols", []),
    )