Skip to content

octopus.modules

Init modules.

AutoGluon

AutoGluon module placeholder when AutoGluon is not installed.

Source code in octopus/modules/__init__.py
class AutoGluon:  # type: ignore[no-redef]
    """Placeholder standing in for the AutoGluon module when the optional
    AutoGluon dependency is missing.

    Any attempt to instantiate it fails with an ImportError that tells the
    user how to install the extra.
    """

    def __init__(self, *args, **kwargs):
        # User-facing guidance; the text is kept verbatim.
        message = (
            "AutoGluon is not installed. Please install it with `pip install octopus[autogluon]` to use this module."
        )
        raise ImportError(message)

Boruta

Bases: Task

Boruta module for feature selection.

Uses the Boruta algorithm to identify all relevant features by comparing importance scores with shadow features.

Configuration

- model: Model to use for Boruta (defaults based on ml_type)
- cv: Number of CV folds
- perc: Percentile threshold for shadow feature comparison
- alpha: Significance level for p-values

Source code in octopus/modules/boruta/module.py
@define
class Boruta(Task):
    """Boruta module for feature selection.

    Uses the Boruta algorithm to identify all relevant features by comparing
    importance scores with shadow features.

    Configuration:
        model: Model to use for Boruta (defaults based on ml_type)
        cv: Number of CV folds
        perc: Percentile threshold for shadow feature comparison
        alpha: Significance level for p-values
    """

    # attrs fields: each validator runs when the config is instantiated.
    # An empty string means "use the default model for the study's ml_type".
    model: str = field(validator=[validators.instance_of(str)], default="")
    """Model used by Boruta."""

    cv: int = field(validator=[validators.instance_of(int)], default=5)
    """Number of folds for CV."""

    perc: int = field(validator=[validators.instance_of(int)], default=100)
    """Percentile (threshold) for comparison between shadow and real features."""

    alpha: float = field(validator=[validators.instance_of(float)], default=0.05)
    """Level at which the corrected p-values will get rejected."""

    def create_module(self) -> ModuleExecution:
        """Create BorutaModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import BorutaModule  # noqa: PLC0415

        return BorutaModule(config=self)

alpha = field(validator=[validators.instance_of(float)], default=0.05) class-attribute instance-attribute

Level at which the corrected p-values will get rejected.

cv = field(validator=[validators.instance_of(int)], default=5) class-attribute instance-attribute

Number of folds for CV.

model = field(validator=[validators.instance_of(str)], default='') class-attribute instance-attribute

Model used by Boruta.

perc = field(validator=[validators.instance_of(int)], default=100) class-attribute instance-attribute

Percentile (threshold) for comparison between shadow and real features.

create_module()

Create BorutaModule execution instance.

Source code in octopus/modules/boruta/module.py
def create_module(self) -> ModuleExecution:
    """Instantiate the BorutaModule that will execute this configuration."""
    # Deferred import keeps heavy dependencies out of the config stage.
    from .core import BorutaModule  # noqa: PLC0415

    module = BorutaModule(config=self)
    return module

DataPartition

Bases: StrEnum

Dataset partitions for feature importance computation.

Source code in octopus/types.py
class DataPartition(StrEnum):
    """Dataset partitions for feature importance computation.

    As a StrEnum, members compare equal to their lower-case string values,
    so they can be used directly wherever a partition-name string is expected.
    """

    TRAIN = "train"
    DEV = "dev"
    TEST = "test"

Efs

Bases: Task

EFS module for ensemble feature selection.

Creates multiple models on random feature subsets and uses ensemble optimization to select the best combination of models.

Configuration

- model: Model to use for EFS (defaults to CatBoost based on ml_type)
- subset_size: Number of features in each random subset
- n_subsets: Number of random subsets to create
- cv: Number of CV folds
- max_n_iterations: Maximum iterations for ensemble optimization
- max_n_models: Maximum number of models to consider

Source code in octopus/modules/efs/module.py
@define
class Efs(Task):
    """EFS module for ensemble feature selection.

    Creates multiple models on random feature subsets and uses ensemble
    optimization to select the best combination of models.

    Configuration:
        model: Model to use for EFS (defaults to CatBoost based on ml_type)
        subset_size: Number of features in each random subset
        n_subsets: Number of random subsets to create
        cv: Number of CV folds
        max_n_iterations: Maximum iterations for ensemble optimization
        max_n_models: Maximum number of models to consider
    """

    # attrs fields: each validator runs when the config is instantiated.
    model: str = field(validator=[validators.instance_of(str)], default="")
    """Model used by EFS (empty string uses default for ml_type)."""

    subset_size: int = field(validator=[validators.instance_of(int)], default=30)
    """Number of features in the subset."""

    n_subsets: int = field(validator=[validators.instance_of(int)], default=100)
    """Number of subsets."""

    cv: int = field(validator=[validators.instance_of(int)], default=5)
    """Number of CV folds for EFS."""

    max_n_iterations: int = field(validator=[validators.instance_of(int)], default=50)
    """Number of iterations for ensemble optimization."""

    max_n_models: int = field(validator=[validators.instance_of(int)], default=30)
    """Maximum number of models used in optimization, pruning."""

    def create_module(self) -> ModuleExecution:
        """Create EfsModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import EfsModule  # noqa: PLC0415

        return EfsModule(config=self)

cv = field(validator=[validators.instance_of(int)], default=5) class-attribute instance-attribute

Number of CV folds for EFS.

max_n_iterations = field(validator=[validators.instance_of(int)], default=50) class-attribute instance-attribute

Number of iterations for ensemble optimization.

max_n_models = field(validator=[validators.instance_of(int)], default=30) class-attribute instance-attribute

Maximum number of models used in optimization, pruning.

model = field(validator=[validators.instance_of(str)], default='') class-attribute instance-attribute

Model used by EFS (empty string uses default for ml_type).

n_subsets = field(validator=[validators.instance_of(int)], default=100) class-attribute instance-attribute

Number of subsets.

subset_size = field(validator=[validators.instance_of(int)], default=30) class-attribute instance-attribute

Number of features in the subset.

create_module()

Create EfsModule execution instance.

Source code in octopus/modules/efs/module.py
def create_module(self) -> ModuleExecution:
    """Instantiate the EfsModule that will execute this configuration."""
    # Deferred import keeps heavy dependencies out of the config stage.
    from .core import EfsModule  # noqa: PLC0415

    module = EfsModule(config=self)
    return module

FIResultLabel

Bases: StrEnum

Labels used in feature-importance result DataFrames.

Every module writes a fi_method column into its result DataFrame. Use these members as the column values so downstream code can filter and aggregate results reliably.

Source code in octopus/types.py
class FIResultLabel(StrEnum):
    """Labels used in feature-importance result DataFrames.

    Every module writes a ``fi_method`` column into its result DataFrame.
    Use these members as the column values so downstream code can filter
    and aggregate results reliably.
    """

    # Member values are the literal strings stored in the ``fi_method`` column.
    INTERNAL = "internal"
    PERMUTATION = "permutation"
    SHAP = "shap"
    LOFO = "lofo"
    CONSTANT = "constant"
    COUNTS = "counts"
    COUNTS_RELATIVE = "counts_relative"

ModuleExecution

Bases: ABC

Base execution class. Created on worker via config.create_module().

Source code in octopus/modules/base.py
@define
class ModuleExecution[T: Task](ABC):
    """Base execution class. Created on worker via config.create_module()."""

    # The Task configuration object this execution was built from.
    config: T = field()

    @abstractmethod
    def fit(
        self,
        *,
        data_traindev: pd.DataFrame,
        data_test: pd.DataFrame,
        feature_cols: list[str],
        study_context: StudyContext,
        outersplit_id: int,
        results_dir: UPath,
        scratch_dir: UPath,
        num_assigned_cpus: int,
        feature_groups: dict[str, list[str]],
        prior_results: dict[str, pd.DataFrame],
    ) -> dict[ResultType, ModuleResult]:
        """Fit the module. Returns dict mapping ResultType to ModuleResult.

        All arguments are keyword-only. ``data_traindev`` / ``data_test``
        carry the outer-split data partitions; ``results_dir`` and
        ``scratch_dir`` are presumably persistent vs. temporary output
        locations — confirm against the worker that invokes this.
        """
        raise NotImplementedError("Subclasses must implement fit()")

    def is_fitted(self) -> bool:
        """Check if module has been fitted."""
        # Fitted implementations are expected to set ``selected_features_``;
        # absence or an explicit None both count as not fitted.
        if hasattr(self, "selected_features_"):
            return self.selected_features_ is not None
        return False

fit(*, data_traindev, data_test, feature_cols, study_context, outersplit_id, results_dir, scratch_dir, num_assigned_cpus, feature_groups, prior_results) abstractmethod

Fit the module. Returns dict mapping ResultType to ModuleResult.

Source code in octopus/modules/base.py
@abstractmethod
def fit(
    self,
    *,
    data_traindev: pd.DataFrame,
    data_test: pd.DataFrame,
    feature_cols: list[str],
    study_context: StudyContext,
    outersplit_id: int,
    results_dir: UPath,
    scratch_dir: UPath,
    num_assigned_cpus: int,
    feature_groups: dict[str, list[str]],
    prior_results: dict[str, pd.DataFrame],
) -> dict[ResultType, ModuleResult]:
    """Fit the module. Returns dict mapping ResultType to ModuleResult.

    All arguments are keyword-only. ``data_traindev`` / ``data_test``
    carry the outer-split data partitions; ``results_dir`` and
    ``scratch_dir`` are presumably persistent vs. temporary output
    locations — confirm against the worker that invokes this.
    """
    raise NotImplementedError("Subclasses must implement fit()")

is_fitted()

Check if module has been fitted.

Source code in octopus/modules/base.py
def is_fitted(self) -> bool:
    """Return True once fitting has populated ``selected_features_``."""
    # getattr with a None default folds "attribute missing" and
    # "attribute explicitly None" into the same not-fitted answer,
    # exactly mirroring the hasattr-based check.
    return getattr(self, "selected_features_", None) is not None

ModuleResult

Unified result container for a single result type from a module.

Carries all 5 artifacts (selected_features, scores, predictions, feature_importances, model) and knows how to save/load itself. Each result_type gets its own directory on disk.

Source code in octopus/modules/result.py
@define
class ModuleResult:
    """Unified result container for a single result type from a module.

    Carries all 5 artifacts (selected_features, scores, predictions,
    feature_importances, model) and knows how to save/load itself.
    Each result_type gets its own directory on disk.
    """

    result_type: ResultType = field()
    module: str = field()
    selected_features: list[str] = field(factory=list)
    scores: pd.DataFrame | None = field(default=None)
    predictions: pd.DataFrame | None = field(default=None)
    feature_importances: pd.DataFrame | None = field(default=None)
    model: Any = field(default=None)

    def save(self, result_dir: UPath) -> None:
        """Save this result to a directory.

        Stamps module + result_type columns on DataFrames, saves parquets,
        selected_features.json, and model/ subdirectory if model is not None.
        DataFrames that are None or empty are not written at all, so ``load``
        restores them as None.

        Args:
            result_dir: Directory to save into (e.g. task0/best/)
        """
        result_dir.mkdir(parents=True, exist_ok=True)

        # Save selected_features.json (always written, even when empty)
        with (result_dir / "selected_features.json").open("w") as f:
            json.dump(self.selected_features, f)

        # Save DataFrames with module + result_type columns stamped
        for name, df in [
            ("scores", self.scores),
            ("predictions", self.predictions),
            ("feature_importances", self.feature_importances),
        ]:
            if df is not None and not df.empty:
                # Copy first so provenance columns never mutate the caller's frame.
                out = df.copy()
                out["module"] = self.module
                out["result_type"] = self.result_type.value
                path = result_dir / f"{name}.parquet"
                parquet_save(out, path)

        # Save model/ subdirectory if model exists
        if self.model is not None:
            model_dir = result_dir / "model"
            model_dir.mkdir(parents=True, exist_ok=True)
            joblib_save(self.model, model_dir / "model.joblib")
            predictor_state = {"selected_features": self.selected_features}
            with (model_dir / "predictor.json").open("w") as f:
                json.dump(predictor_state, f, indent=2)

    @classmethod
    def load(cls, result_dir: UPath, result_type: ResultType, module: str) -> "ModuleResult":
        """Load a ModuleResult from a saved directory.

        Artifacts missing on disk come back as None (DataFrames, model)
        or an empty list (selected_features).

        Args:
            result_dir: Directory containing saved result files
            result_type: The ResultType for this directory
            module: Module name

        Returns:
            Reconstructed ModuleResult instance
        """
        # Load selected features
        sf_path = result_dir / "selected_features.json"
        if sf_path.exists():
            with sf_path.open() as f:
                selected_features = json.load(f)
        else:
            selected_features = []

        # Load DataFrames (None if file doesn't exist)
        scores: pd.DataFrame | None = None
        predictions: pd.DataFrame | None = None
        feature_importances: pd.DataFrame | None = None

        for name in ["scores", "predictions", "feature_importances"]:
            path = result_dir / f"{name}.parquet"
            if path.exists():
                df = parquet_load(path)
                if name == "scores":
                    scores = df
                elif name == "predictions":
                    predictions = df
                elif name == "feature_importances":
                    feature_importances = df

        # Load model if exists
        model = None
        model_dir = result_dir / "model"
        model_path = model_dir / "model.joblib"
        if model_path.exists():
            model = joblib_load(model_path)

        return cls(
            result_type=result_type,
            module=module,
            selected_features=selected_features,
            scores=scores,
            predictions=predictions,
            feature_importances=feature_importances,
            model=model,
        )

load(result_dir, result_type, module) classmethod

Load a ModuleResult from a saved directory.

Parameters:

Name Type Description Default
result_dir UPath

Directory containing saved result files

required
result_type ResultType

The ResultType for this directory

required
module str

Module name

required

Returns:

Type Description
ModuleResult

Reconstructed ModuleResult instance

Source code in octopus/modules/result.py
@classmethod
def load(cls, result_dir: UPath, result_type: ResultType, module: str) -> "ModuleResult":
    """Reconstruct a ModuleResult from a directory written by ``save``.

    Args:
        result_dir: Directory containing saved result files
        result_type: The ResultType for this directory
        module: Module name

    Returns:
        Reconstructed ModuleResult instance
    """
    # Selected features default to an empty list when the JSON is absent.
    selected_features: list[str] = []
    features_file = result_dir / "selected_features.json"
    if features_file.exists():
        with features_file.open() as fh:
            selected_features = json.load(fh)

    # Each optional parquet artifact is read if present; missing files map to None.
    frames: dict[str, pd.DataFrame | None] = {}
    for artifact in ("scores", "predictions", "feature_importances"):
        parquet_path = result_dir / f"{artifact}.parquet"
        frames[artifact] = parquet_load(parquet_path) if parquet_path.exists() else None

    # Restore the persisted model, if one was saved alongside the results.
    model = None
    joblib_path = result_dir / "model" / "model.joblib"
    if joblib_path.exists():
        model = joblib_load(joblib_path)

    return cls(
        result_type=result_type,
        module=module,
        selected_features=selected_features,
        scores=frames["scores"],
        predictions=frames["predictions"],
        feature_importances=frames["feature_importances"],
        model=model,
    )

save(result_dir)

Save this result to a directory.

Stamps module + result_type columns on DataFrames, saves parquets, selected_features.json, and model/ subdirectory if model is not None.

Parameters:

Name Type Description Default
result_dir UPath

Directory to save into (e.g. task0/best/)

required
Source code in octopus/modules/result.py
def save(self, result_dir: UPath) -> None:
    """Persist this result into ``result_dir``.

    Stamps module + result_type columns on DataFrames, saves parquets,
    selected_features.json, and model/ subdirectory if model is not None.

    Args:
        result_dir: Directory to save into (e.g. task0/best/)
    """
    result_dir.mkdir(parents=True, exist_ok=True)

    # The feature list is always written, even when empty.
    with (result_dir / "selected_features.json").open("w") as fh:
        json.dump(self.selected_features, fh)

    # Non-empty DataFrames get provenance columns stamped before serialization.
    artifacts = {
        "scores": self.scores,
        "predictions": self.predictions,
        "feature_importances": self.feature_importances,
    }
    for artifact, frame in artifacts.items():
        if frame is None or frame.empty:
            continue
        stamped = frame.copy()
        stamped["module"] = self.module
        stamped["result_type"] = self.result_type.value
        parquet_save(stamped, result_dir / f"{artifact}.parquet")

    # A fitted model goes into its own subdirectory with predictor state.
    if self.model is not None:
        model_dir = result_dir / "model"
        model_dir.mkdir(parents=True, exist_ok=True)
        joblib_save(self.model, model_dir / "model.joblib")
        predictor_state = {"selected_features": self.selected_features}
        with (model_dir / "predictor.json").open("w") as fh:
            json.dump(predictor_state, fh, indent=2)

Mrmr

Bases: Task

MRMR module for feature selection based on mutual information and redundancy.

Uses the maximum relevance minimum redundancy algorithm to select features that are maximally relevant to the target while minimizing redundancy among selected features.

Configuration

- n_features: Number of features to select
- correlation_type: Type of correlation to measure redundancy
- relevance_type: Method to calculate relevance (MRMRRelevance.PERMUTATION or MRMRRelevance.INTERNAL)
- results_module: Module name to filter prior results' feature importances (for permutation relevance)
- feature_importance_type: Type of FI aggregation (MRMRFIAggregation.MEAN or MRMRFIAggregation.COUNT)
- feature_importance_method: FI calculation method (FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.INTERNAL, FIComputeMethod.LOFO)

Source code in octopus/modules/mrmr/module.py
@define
class Mrmr(Task):
    """MRMR module for feature selection based on mutual information and redundancy.

    Uses the maximum relevance minimum redundancy algorithm to select features
    that are maximally relevant to the target while minimizing redundancy among
    selected features.

    Configuration:
        n_features: Number of features to select
        correlation_type: Type of correlation to measure redundancy
        relevance_type: Method to calculate relevance (MRMRRelevance.PERMUTATION or MRMRRelevance.INTERNAL)
        results_module: Module name to filter prior results' feature importances (for permutation relevance)
        feature_importance_type: Type of FI aggregation (MRMRFIAggregation.MEAN or MRMRFIAggregation.COUNT)
        feature_importance_method: FI calculation method (FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.INTERNAL, FIComputeMethod.LOFO)
    """

    n_features: int = field(validator=[validators.instance_of(int)], default=Factory(lambda: 30))
    """Number of features selected by MRMR."""

    # Enum fields below use attrs converters: raw string inputs are coerced
    # into enum members before the in_ validators run.
    correlation_type: CorrelationType = field(
        converter=CorrelationType,
        validator=validators.in_([CorrelationType.PEARSON, CorrelationType.SPEARMAN, CorrelationType.RDC]),
        default=CorrelationType.SPEARMAN,
    )
    """Selection of correlation type."""

    relevance_type: MRMRRelevance = field(
        converter=MRMRRelevance, validator=validators.in_(list(MRMRRelevance)), default=MRMRRelevance.PERMUTATION
    )
    """Selection of relevance measure."""

    results_module: str = field(
        validator=validators.instance_of(str),
        default="octo",
    )
    """Module name from which feature importances were created."""

    feature_importance_type: MRMRFIAggregation = field(
        converter=MRMRFIAggregation, validator=validators.in_(list(MRMRFIAggregation)), default=MRMRFIAggregation.MEAN
    )
    """Selection of feature importance type."""

    feature_importance_method: FIComputeMethod = field(
        converter=FIComputeMethod,
        validator=validators.in_(
            [FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.INTERNAL, FIComputeMethod.LOFO]
        ),
        default=FIComputeMethod.PERMUTATION,
    )
    """Selection of feature importance method."""

    def create_module(self) -> ModuleExecution:
        """Create MrmrModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import MrmrModule  # noqa: PLC0415

        return MrmrModule(config=self)

correlation_type = field(converter=CorrelationType, validator=(validators.in_([CorrelationType.PEARSON, CorrelationType.SPEARMAN, CorrelationType.RDC])), default=(CorrelationType.SPEARMAN)) class-attribute instance-attribute

Selection of correlation type.

feature_importance_method = field(converter=FIComputeMethod, validator=(validators.in_([FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.INTERNAL, FIComputeMethod.LOFO])), default=(FIComputeMethod.PERMUTATION)) class-attribute instance-attribute

Selection of feature importance method.

feature_importance_type = field(converter=MRMRFIAggregation, validator=(validators.in_(list(MRMRFIAggregation))), default=(MRMRFIAggregation.MEAN)) class-attribute instance-attribute

Selection of feature importance type.

n_features = field(validator=[validators.instance_of(int)], default=(Factory(lambda: 30))) class-attribute instance-attribute

Number of features selected by MRMR.

relevance_type = field(converter=MRMRRelevance, validator=(validators.in_(list(MRMRRelevance))), default=(MRMRRelevance.PERMUTATION)) class-attribute instance-attribute

Selection of relevance measure.

results_module = field(validator=(validators.instance_of(str)), default='octo') class-attribute instance-attribute

Module name from which feature importances were created.

create_module()

Create MrmrModule execution instance.

Source code in octopus/modules/mrmr/module.py
def create_module(self) -> ModuleExecution:
    """Instantiate the MrmrModule that will execute this configuration."""
    # Deferred import keeps heavy dependencies out of the config stage.
    from .core import MrmrModule  # noqa: PLC0415

    module = MrmrModule(config=self)
    return module

Octo

Bases: Task

Octo module for feature selection and model optimization.

Uses Optuna for hyperparameter optimization with cross-validation, supporting:

- Multiple ML models
- MRMR feature selection
- Ensemble selection
- Bag-based model ensembling

Configuration

- models: List of model names to optimize
- n_folds_inner: Number of inner CV folds
- n_trials: Number of Optuna trials
- ensemble_selection: Whether to perform ensemble selection
- mrmr_feature_numbers: Feature counts for MRMR feature selection

Source code in octopus/modules/octo/module.py
@define
class Octo(Task):
    """Octo module for feature selection and model optimization.

    Uses Optuna for hyperparameter optimization with cross-validation, supporting:
    - Multiple ML models
    - MRMR feature selection
    - Ensemble selection
    - Bag-based model ensembling

    Configuration:
        models: List of model names to optimize
        n_folds_inner: Number of inner CV folds
        n_trials: Number of Optuna trials
        ensemble_selection: Whether to perform ensemble selection
        mrmr_feature_numbers: Feature counts for MRMR feature selection
    """

    models: list[ModelName] | None = field(
        default=None,
        converter=_convert_models,
    )
    """Models for ML. If None, defaults are resolved at fit time based on ml_type."""

    n_folds_inner: int = field(validator=[validators.instance_of(int)], default=5)
    """Number of inner folds."""

    datasplit_seeds_inner: list[int] = field(
        default=Factory(lambda: [0]),
        validator=validators.deep_iterable(
            member_validator=validators.instance_of(int),
            iterable_validator=validators.instance_of(list),
        ),
    )
    """List of integers used as seeds for data splitting."""

    model_seed: int = field(validator=[validators.instance_of(int)], default=0)
    """Model seed."""

    n_jobs: int = field(validator=[validators.instance_of(int)], default=1)
    """Number of CPUs used for every model training."""

    max_outl: int = field(validator=[validators.instance_of(int)], default=3)
    """Maximum number of outliers, optimized by Optuna"""

    fi_methods_bestbag: list[FIComputeMethod] = field(
        default=Factory(lambda: [FIComputeMethod.PERMUTATION]),
        converter=lambda vs: [FIComputeMethod(v) for v in vs],
        validator=validators.deep_iterable(
            member_validator=validators.in_(
                [FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.CONSTANT]
            ),
            iterable_validator=validators.instance_of(list),
        ),
    )
    """Feature importance methods for best bag."""

    inner_parallelization: bool = field(validator=[validators.instance_of(bool)], default=True)
    """Enable inner parallelization. Defaults is True."""

    n_workers: int = field(default=None)
    """Number of workers."""

    optuna_seed: int = field(validator=[validators.instance_of(int)], default=0)
    """Seed for Optuna TPESampler, default=0"""

    n_optuna_startup_trials: int = field(validator=[validators.instance_of(int)], default=15)
    """Number of Optuna startup trials (random sampler)"""

    ensemble_selection: bool = field(validator=[validators.in_([True, False])], default=False)
    """Whether to perform ensemble selection."""

    ensel_n_save_trials: int = field(validator=[validators.instance_of(int)], default=50)
    """Number of top trials to be saved for ensemble selection (bags)."""

    n_trials: int = field(validator=[validators.instance_of(int)], default=200 if not _RUNNING_IN_TESTSUITE else 3)
    """Number of Optuna trials."""

    hyperparameters: dict = field(validator=[validators.instance_of(dict)], default=Factory(dict))
    """Bring own hyperparameter space."""

    max_features: int = field(validator=[validators.instance_of(int)], default=0)
    """Maximum features to constrain hyperparameter optimization. Default is zero (off)."""

    penalty_factor: float = field(validator=[validators.instance_of(float)], default=1.0)
    """Factor to penalize optuna target related to feature constraint."""

    mrmr_feature_numbers: list = field(validator=[validators.instance_of(list)], default=Factory(list))
    """List of feature numbers to be investigated by mrmr."""

    optuna_return: OptunaReturnType = field(
        default=OptunaReturnType.POOL,
        converter=OptunaReturnType,
        validator=validators.in_(list(OptunaReturnType)),
    )
    """How to calculate the bag performance for the optuna optimization target."""

    def __attrs_post_init__(self):
        # (1) set default of n_workers to n_folds_inner
        if self.n_workers is None:
            self.n_workers = self.n_folds_inner
        if self.n_workers != self.n_folds_inner:
            logger.warning(
                f"Octofull Warning: n_workers ({self.n_workers}) does not match n_folds_inner ({self.n_folds_inner})",
            )
        # (2) Only enforce constrained-HPO compatibility when max_features > 0 and models are specified
        if self.max_features > 0 and self.models is not None:
            incompatible_models: list[ModelName] = []

            for m in self.models:
                config = Models.get_config(m)
                chpo_flag = config.chpo_compatible
                logger.info(f"Model '{m}': chpo_compatible={chpo_flag}")

                if not chpo_flag:
                    incompatible_models.append(m)

            if incompatible_models:
                msg = (
                    "Octo: The following models are not compatible with constrained HPO. "
                    "Please remove those model or turn constrained HPO off (max_features=0): "
                    + ", ".join(incompatible_models)
                )
                logger.error(msg)
                raise ValueError(msg)

    def create_module(self) -> ModuleExecution:
        """Create OctoModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import OctoModuleTemplate  # noqa: PLC0415

        return OctoModuleTemplate(config=self)

datasplit_seeds_inner = field(default=(Factory(lambda: [0])), validator=(validators.deep_iterable(member_validator=(validators.instance_of(int)), iterable_validator=(validators.instance_of(list))))) class-attribute instance-attribute

List of integers used as seeds for data splitting.

ensel_n_save_trials = field(validator=[validators.instance_of(int)], default=50) class-attribute instance-attribute

Number of top trials to be saved for ensemble selection (bags).

ensemble_selection = field(validator=[validators.in_([True, False])], default=False) class-attribute instance-attribute

Whether to perform ensemble selection.

fi_methods_bestbag = field(default=(Factory(lambda: [FIComputeMethod.PERMUTATION])), converter=(lambda vs: [(FIComputeMethod(v)) for v in vs]), validator=(validators.deep_iterable(member_validator=(validators.in_([FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.CONSTANT])), iterable_validator=(validators.instance_of(list))))) class-attribute instance-attribute

Feature importance methods for best bag.

hyperparameters = field(validator=[validators.instance_of(dict)], default=(Factory(dict))) class-attribute instance-attribute

Bring own hyperparameter space.

inner_parallelization = field(validator=[validators.instance_of(bool)], default=True) class-attribute instance-attribute

Enable inner parallelization. Default is True.

max_features = field(validator=[validators.instance_of(int)], default=0) class-attribute instance-attribute

Maximum features to constrain hyperparameter optimization. Default is zero (off).

max_outl = field(validator=[validators.instance_of(int)], default=3) class-attribute instance-attribute

Maximum number of outliers, optimized by Optuna

model_seed = field(validator=[validators.instance_of(int)], default=0) class-attribute instance-attribute

Model seed.

models = field(default=None, converter=_convert_models) class-attribute instance-attribute

Models for ML. If None, defaults are resolved at fit time based on ml_type.

mrmr_feature_numbers = field(validator=[validators.instance_of(list)], default=(Factory(list))) class-attribute instance-attribute

List of feature numbers to be investigated by mrmr.

n_folds_inner = field(validator=[validators.instance_of(int)], default=5) class-attribute instance-attribute

Number of inner folds.

n_jobs = field(validator=[validators.instance_of(int)], default=1) class-attribute instance-attribute

Number of CPUs used for every model training.

n_optuna_startup_trials = field(validator=[validators.instance_of(int)], default=15) class-attribute instance-attribute

Number of Optuna startup trials (random sampler)

n_trials = field(validator=[validators.instance_of(int)], default=(200 if not _RUNNING_IN_TESTSUITE else 3)) class-attribute instance-attribute

Number of Optuna trials.

n_workers = field(default=None) class-attribute instance-attribute

Number of workers.

optuna_return = field(default=(OptunaReturnType.POOL), converter=OptunaReturnType, validator=(validators.in_(list(OptunaReturnType)))) class-attribute instance-attribute

How to calculate the bag performance for the optuna optimization target.

optuna_seed = field(validator=[validators.instance_of(int)], default=0) class-attribute instance-attribute

Seed for Optuna TPESampler, default=0

penalty_factor = field(validator=[validators.instance_of(float)], default=1.0) class-attribute instance-attribute

Factor to penalize optuna target related to feature constraint.

create_module()

Create OctoModule execution instance.

Source code in octopus/modules/octo/module.py
def create_module(self) -> ModuleExecution:
    """Create OctoModule execution instance."""
    # Deferred import: the heavy core dependency is only pulled in when the
    # module is actually executed, keeping configuration construction cheap.
    from .core import OctoModuleTemplate  # noqa: PLC0415

    module = OctoModuleTemplate(config=self)
    return module

ResultType

Bases: StrEnum

Types of results produced by modules.

Source code in octopus/types.py
class ResultType(StrEnum):
    """Types of results produced by modules.

    Members are ``StrEnum`` values, so each member compares equal to (and
    serializes as) its plain string value.
    """

    BEST = "best"
    ENSEMBLE_SELECTION = "ensemble_selection"

Rfe

Bases: Task

RFE module for recursive feature elimination.

Uses sklearn's RFECV with hyperparameter optimization to recursively eliminate features based on feature importances.

Configuration

model: Model to use for RFE (defaults to CatBoost based on ml_type)
step: Number of features to remove at each iteration
min_features_to_select: Minimum number of features to keep
cv: Number of CV folds for RFECV
mode: RFEMode.FIXED (use optimized model) or RFEMode.REFIT (reoptimize at each step)

Source code in octopus/modules/rfe/module.py
@define
class Rfe(Task):
    """RFE module for recursive feature elimination.

    Uses sklearn's RFECV with hyperparameter optimization to recursively
    eliminate features based on feature importances.

    Configuration:
        model: Model to use for RFE (defaults to CatBoost based on ml_type)
        step: Number of features to remove at each iteration
        min_features_to_select: Minimum number of features to keep
        cv: Number of CV folds for RFECV
        mode: RFEMode.FIXED (use optimized model) or RFEMode.REFIT (reoptimize at each step)
    """

    # NOTE(review): validators below only check types; values such as step < 1
    # or cv < 2 are not rejected here — confirm they are validated upstream.
    model: str = field(validator=[validators.instance_of(str)], default="")
    """Model used by RFE (empty string uses default for ml_type)."""

    step: int = field(validator=[validators.instance_of(int)], default=1)
    """Number of features to remove at each iteration."""

    min_features_to_select: int = field(validator=[validators.instance_of(int)], default=1)
    """Minimum number of features to be selected."""

    cv: int = field(validator=[validators.instance_of(int)], default=5)
    """Number of CV folds for RFE_CV."""

    # The converter coerces plain strings (e.g. "fixed") into RFEMode members
    # before the membership validator runs.
    mode: RFEMode = field(converter=RFEMode, validator=validators.in_(list(RFEMode)), default=RFEMode.FIXED)
    """Mode used by RFE: fixed=optimized model, refit=reoptimize each step."""

    def create_module(self) -> ModuleExecution:
        """Create RfeModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import RfeModule  # noqa: PLC0415

        return RfeModule(config=self)

cv = field(validator=[validators.instance_of(int)], default=5) class-attribute instance-attribute

Number of CV folds for RFE_CV.

min_features_to_select = field(validator=[validators.instance_of(int)], default=1) class-attribute instance-attribute

Minimum number of features to be selected.

mode = field(converter=RFEMode, validator=(validators.in_(list(RFEMode))), default=(RFEMode.FIXED)) class-attribute instance-attribute

Mode used by RFE: fixed=optimized model, refit=reoptimize each step.

model = field(validator=[validators.instance_of(str)], default='') class-attribute instance-attribute

Model used by RFE (empty string uses default for ml_type).

step = field(validator=[validators.instance_of(int)], default=1) class-attribute instance-attribute

Number of features to remove at each iteration.

create_module()

Create RfeModule execution instance.

Source code in octopus/modules/rfe/module.py
def create_module(self) -> ModuleExecution:
    """Create RfeModule execution instance."""
    # Imported lazily so that configuring the task does not drag in the
    # heavyweight execution machinery.
    from .core import RfeModule  # noqa: PLC0415

    module = RfeModule(config=self)
    return module

Rfe2

Bases: Octo

Rfe2 module for recursive feature elimination with Octo optimization.

Extends Octo to add RFE functionality. First runs Octo optimization to get a best bag, then iteratively removes features based on feature importances.

Configuration

(inherits all Octo configuration)
min_features_to_select: Minimum number of features to keep
fi_method_rfe: Feature importance method for RFE
selection_method: Method to select best solution (best or parsimonious)
abs_on_fi: Convert negative feature importances to positive

Source code in octopus/modules/rfe2/module.py
@define
class Rfe2(Octo):
    """Rfe2 module for recursive feature elimination with Octo optimization.

    Extends Octo to add RFE functionality. First runs Octo optimization to get
    a best bag, then iteratively removes features based on feature importances.

    Configuration:
        (inherits all Octo configuration)
        min_features_to_select: Minimum number of features to keep
        fi_method_rfe: Feature importance method for RFE
        selection_method: Method to select best solution (best or parsimonious)
        abs_on_fi: Convert negative feature importances to positive
    """

    min_features_to_select: int = field(validator=[validators.instance_of(int)], default=1)
    """Minimum number of features to be selected."""

    # Only permutation and SHAP importances are accepted for RFE.
    fi_method_rfe: FIComputeMethod = field(
        converter=FIComputeMethod,
        validator=validators.in_([FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP]),
        default=FIComputeMethod.PERMUTATION,
    )
    """Feature importance method for RFE."""

    selection_method: RFE2SelectionMethod = field(
        converter=RFE2SelectionMethod,
        validator=validators.in_(list(RFE2SelectionMethod)),
        default=RFE2SelectionMethod.BEST,
    )
    """Method to select best solution. Parsimonious: smallest solutions within sem."""

    abs_on_fi: bool = field(validator=[validators.instance_of(bool)], default=False)
    """Convert negative feature importances to positive (abs())."""

    def __attrs_post_init__(self):
        # Call parent post_init
        super().__attrs_post_init__()

        # overwrite fi_methods_bestbag for Octo
        # NOTE: this unconditionally replaces the inherited value with the
        # single RFE importance method, so any user-supplied
        # fi_methods_bestbag is silently discarded.
        self.fi_methods_bestbag = [self.fi_method_rfe]

    def create_module(self) -> ModuleExecution:
        """Create Rfe2Module execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import Rfe2Module  # noqa: PLC0415

        return Rfe2Module(config=self)

abs_on_fi = field(validator=[validators.instance_of(bool)], default=False) class-attribute instance-attribute

Convert negative feature importances to positive (abs()).

fi_method_rfe = field(converter=FIComputeMethod, validator=(validators.in_([FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP])), default=(FIComputeMethod.PERMUTATION)) class-attribute instance-attribute

Feature importance method for RFE.

min_features_to_select = field(validator=[validators.instance_of(int)], default=1) class-attribute instance-attribute

Minimum number of features to be selected.

selection_method = field(converter=RFE2SelectionMethod, validator=(validators.in_(list(RFE2SelectionMethod))), default=(RFE2SelectionMethod.BEST)) class-attribute instance-attribute

Method to select best solution. Parsimonious: smallest solutions within sem.

create_module()

Create Rfe2Module execution instance.

Source code in octopus/modules/rfe2/module.py
def create_module(self) -> ModuleExecution:
    """Create Rfe2Module execution instance."""
    # Lazy import keeps config construction free of heavy runtime dependencies.
    from .core import Rfe2Module  # noqa: PLC0415

    module = Rfe2Module(config=self)
    return module

Roc

Bases: Task

ROC module for removing correlated features.

This module identifies groups of correlated features and selects the most informative feature from each group, removing the rest. Uses correlation analysis (Spearman or RDC) combined with feature filtering (mutual information or F-statistics) to determine which features to keep.

Configuration

threshold: Correlation threshold above which features are considered correlated
correlation_type: Type of correlation measure (CorrelationType.SPEARMAN or CorrelationType.RDC)
filter_type: Method to select best feature in group (ROCFilterMethod.MUTUAL_INFO or ROCFilterMethod.F_STATISTICS)

Source code in octopus/modules/roc/module.py
@define
class Roc(Task):
    """ROC module for removing correlated features.

    This module identifies groups of correlated features and selects the most
    informative feature from each group, removing the rest. Uses correlation
    analysis (Spearman or RDC) combined with feature filtering (mutual information
    or F-statistics) to determine which features to keep.

    Configuration:
        threshold: Correlation threshold above which features are considered correlated
        correlation_type: Type of correlation measure (CorrelationType.SPEARMAN or CorrelationType.RDC)
        filter_type: Method to select best feature in group (ROCFilterMethod.MUTUAL_INFO or ROCFilterMethod.F_STATISTICS)
    """

    # NOTE(review): instance_of(float) rejects integer values such as 1 —
    # confirm callers always pass a float here.
    threshold: float = field(validator=[validators.instance_of(float)], default=0.8)
    """Threshold for feature removal (features with correlation > threshold are grouped)."""

    # Converter coerces plain strings into CorrelationType; only the two
    # listed members pass the validator.
    correlation_type: CorrelationType = field(
        converter=CorrelationType,
        validator=validators.in_([CorrelationType.SPEARMAN, CorrelationType.RDC]),
        default=CorrelationType.SPEARMAN,
    )
    """Selection of correlation type."""

    filter_type: ROCFilterMethod = field(
        converter=ROCFilterMethod,
        validator=validators.in_([ROCFilterMethod.MUTUAL_INFO, ROCFilterMethod.F_STATISTICS]),
        default=ROCFilterMethod.F_STATISTICS,
    )
    """Selection of filter type for correlated features."""

    def create_module(self) -> ModuleExecution:
        """Create RocModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import RocModule  # noqa: PLC0415

        return RocModule(config=self)

correlation_type = field(converter=CorrelationType, validator=(validators.in_([CorrelationType.SPEARMAN, CorrelationType.RDC])), default=(CorrelationType.SPEARMAN)) class-attribute instance-attribute

Selection of correlation type.

filter_type = field(converter=ROCFilterMethod, validator=(validators.in_([ROCFilterMethod.MUTUAL_INFO, ROCFilterMethod.F_STATISTICS])), default=(ROCFilterMethod.F_STATISTICS)) class-attribute instance-attribute

Selection of filter type for correlated features.

threshold = field(validator=[validators.instance_of(float)], default=0.8) class-attribute instance-attribute

Threshold for feature removal (features with correlation > threshold are grouped).

create_module()

Create RocModule execution instance.

Source code in octopus/modules/roc/module.py
def create_module(self) -> ModuleExecution:
    """Create RocModule execution instance."""
    # Deferred import: avoids loading the execution core at configuration time.
    from .core import RocModule  # noqa: PLC0415

    module = RocModule(config=self)
    return module

Sfs

Bases: Task

SFS module for sequential feature selection.

Uses sequential feature selection (forward, backward, or floating variants) to find the optimal feature subset.

Configuration

model: Model to use for SFS (defaults based on ml_type)
cv: Number of CV folds
sfs_type: Type of SFS (forward, backward, floating_forward, floating_backward)

Source code in octopus/modules/sfs/module.py
@define
class Sfs(Task):
    """SFS module for sequential feature selection.

    Uses sequential feature selection (forward, backward, or floating variants)
    to find the optimal feature subset.

    Configuration:
        model: Model to use for SFS (defaults based on ml_type)
        cv: Number of CV folds
        sfs_type: Type of SFS (forward, backward, floating_forward, floating_backward)
    """

    # NOTE(review): an empty string presumably selects a default model for the
    # study's ml_type, as in the sibling Rfe/Boruta configs — confirm.
    model: str = field(validator=[validators.instance_of(str)], default="")
    """Model used by SFS."""

    cv: int = field(validator=[validators.instance_of(int)], default=5)
    """Number of CV folds for SFS."""

    # Converter coerces plain strings (e.g. "backward") into SFSDirection.
    sfs_type: SFSDirection = field(
        converter=SFSDirection, validator=validators.in_(list(SFSDirection)), default=SFSDirection.BACKWARD
    )
    """SFS type used."""

    def create_module(self) -> ModuleExecution:
        """Create SfsModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import SfsModule  # noqa: PLC0415

        return SfsModule(config=self)

cv = field(validator=[validators.instance_of(int)], default=5) class-attribute instance-attribute

Number of CV folds for SFS.

model = field(validator=[validators.instance_of(str)], default='') class-attribute instance-attribute

Model used by SFS.

sfs_type = field(converter=SFSDirection, validator=(validators.in_(list(SFSDirection))), default=(SFSDirection.BACKWARD)) class-attribute instance-attribute

SFS type used.

create_module()

Create SfsModule execution instance.

Source code in octopus/modules/sfs/module.py
def create_module(self) -> ModuleExecution:
    """Create SfsModule execution instance."""
    # Lazy import so that building the config does not require the core package.
    from .core import SfsModule  # noqa: PLC0415

    module = SfsModule(config=self)
    return module

StudyContext

Immutable runtime context passed to modules during fit().

Contains only the finalized/prepared values needed by modules. No OctoStudy dependency - only attrs + upath.

Source code in octopus/modules/context.py
@frozen
class StudyContext:
    """Immutable runtime context passed to modules during fit().

    Contains only the finalized/prepared values needed by modules.
    No OctoStudy dependency - only attrs + upath.

    Declared with ``@frozen``, so attributes cannot be reassigned after
    construction; modules may safely share a single instance.
    """

    ml_type: MLType
    """MLType enum (e.g. MLType.BINARY, MLType.REGRESSION, MLType.TIMETOEVENT)."""

    target_metric: str
    """Primary metric for model evaluation."""

    target_assignments: dict[str, str]
    """Target column assignments (e.g. {'default': 'target'} or {'duration': ..., 'event': ...})."""

    positive_class: int | None
    """Positive class label for binary classification. None for regression/multiclass."""

    stratification_col: str | None
    """Column used for stratification during data splitting."""

    sample_id_col: str
    """Identifier for sample instances."""

    feature_cols: list[str]
    """Prepared feature columns (from PreparedData.feature_cols)."""

    row_id_col: str
    """Prepared row identifier (from PreparedData.row_id_col)."""

    output_path: UPath
    """Full output path for this study."""

    log_dir: UPath
    """Directory where logs are stored."""

feature_cols instance-attribute

Prepared feature columns (from PreparedData.feature_cols).

log_dir instance-attribute

Directory where logs are stored.

ml_type instance-attribute

MLType enum (e.g. MLType.BINARY, MLType.REGRESSION, MLType.TIMETOEVENT).

output_path instance-attribute

Full output path for this study.

positive_class instance-attribute

Positive class label for binary classification. None for regression/multiclass.

row_id_col instance-attribute

Prepared row identifier (from PreparedData.row_id_col).

sample_id_col instance-attribute

Identifier for sample instances.

stratification_col instance-attribute

Column used for stratification during data splitting.

target_assignments instance-attribute

Target column assignments (e.g. {'default': 'target'} or {'duration': ..., 'event': ...}).

target_metric instance-attribute

Primary metric for model evaluation.

Task

Bases: ABC

Base config class for all workflow tasks.

Source code in octopus/modules/base.py
@define
class Task(ABC):
    """Base config class for all workflow tasks.

    Concrete modules subclass this, add their own configuration fields, and
    implement :meth:`create_module` to produce the execution object.
    """

    # Non-negative identifier of this task (validated >= 0).
    task_id: int = field(validator=[validators.instance_of(int), validators.ge(0)])
    # NOTE(review): presumably the task_id of a prerequisite task; None means
    # no dependency — confirm against the workflow scheduler.
    depends_on: int | None = field(default=None, validator=validators.optional(validators.instance_of(int)))
    # Free-text description of the task.
    description: str = field(default="", validator=[validators.instance_of(str)])
    # Whether categorical encoding is applied for this task.
    categorical_encoding: bool = field(default=False, validator=[validators.instance_of(bool)])

    @property
    def module(self) -> str:
        """Module name derived from class name."""
        # e.g. class Rfe -> "rfe"; keeps config classes and module names in sync.
        return type(self).__name__.lower()

    @abstractmethod
    def create_module(self) -> ModuleExecution:
        """Create an execution module from this config."""
        raise NotImplementedError("Subclasses must implement create_module()")

module property

Module name derived from class name.

create_module() abstractmethod

Create an execution module from this config.

Source code in octopus/modules/base.py
@abstractmethod
def create_module(self) -> ModuleExecution:
    """Create an execution module from this config.

    Raises:
        NotImplementedError: always; subclasses must override.
    """
    raise NotImplementedError("Subclasses must implement create_module()")

rdc_correlation_matrix(df)

Calculate RDC correlation matrix.

Source code in octopus/modules/utils.py
def rdc_correlation_matrix(df):
    """Calculate the RDC correlation matrix for all pairs of columns in *df*.

    Args:
        df: DataFrame whose columns are the features to correlate.

    Returns:
        A symmetric ``(n_features, n_features)`` ndarray with 1.0 on the
        diagonal and ``rdc(col_i, col_j)`` elsewhere.
    """
    n_features = df.shape[1]
    # np.eye sets the diagonal to 1.0 up front, so only off-diagonal pairs
    # need to be computed below.
    rdc_matrix = np.eye(n_features)

    # RDC is symmetric, so compute only the strict upper triangle and mirror.
    for i in range(n_features):
        for j in range(i + 1, n_features):
            rdc_value = rdc(df.iloc[:, i].values, df.iloc[:, j].values)
            rdc_matrix[i, j] = rdc_value
            rdc_matrix[j, i] = rdc_value

    return rdc_matrix