Skip to content

Modules

Init modules.

AutoGluon

AutoGluon module placeholder when AutoGluon is not installed.

Source code in octopus/modules/__init__.py
class AutoGluon:  # type: ignore[no-redef]
    """Stand-in for the AutoGluon module when the optional dependency is absent."""

    def __init__(self, *args, **kwargs):
        # Fail loudly at instantiation time with installation instructions.
        msg = "AutoGluon is not installed. Please install it with `pip install octopus[autogluon]` to use this module."
        raise ImportError(msg)

Boruta

Bases: Task

Boruta module for feature selection.

Uses the Boruta algorithm to identify all relevant features by comparing importance scores with shadow features.

Configuration

model: Model to use for Boruta (defaults based on ml_type)
n_inner_splits: Number of CV folds
threshold: Percentile threshold for shadow feature comparison (0-100)
alpha: Significance level for p-values (0-1)

Source code in octopus/modules/boruta/module.py
@define
class Boruta(Task):
    """Boruta module for feature selection.

    Uses the Boruta algorithm to identify all relevant features by comparing
    importance scores with shadow features.

    Configuration:
        model: Model to use for Boruta (defaults based on ml_type)
        n_inner_splits: Number of CV folds
        threshold: Percentile threshold for shadow feature comparison (0-100)
        alpha: Significance level for p-values (0-1)
    """

    # Converter accepts raw strings or None and coerces non-None values to ModelName.
    model: ModelName | None = field(default=None, converter=lambda v: ModelName(v) if v is not None else None)
    """Model used by Boruta. If None, defaults are resolved at fit time based on ml_type."""

    n_inner_splits: int = field(validator=[validators.instance_of(int)], default=5)
    """Number of inner folds."""

    # Percentile (0-100, inclusive) of shadow-feature importances used as the cut-off.
    threshold: int = field(
        default=100,
        validator=[validators.instance_of(int), validators.ge(0), validators.le(100)],
    )
    """Percentile threshold for comparison between shadow and real features (0-100)."""

    # Must lie strictly between 0 and 1 (exclusive) per the gt/lt validators.
    alpha: float = field(
        default=0.05,
        validator=[validators.instance_of(float), validators.gt(0), validators.lt(1)],
    )
    """Significance level at which the corrected p-values will get rejected (0-1)."""

    def create_module(self) -> ModuleExecution:
        """Create BorutaModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import BorutaModule  # noqa: PLC0415

        return BorutaModule(config=self)

alpha = field(default=0.05, validator=[validators.instance_of(float), validators.gt(0), validators.lt(1)]) class-attribute instance-attribute

Significance level at which the corrected p-values will get rejected (0-1).

model = field(default=None, converter=(lambda v: ModelName(v) if v is not None else None)) class-attribute instance-attribute

Model used by Boruta. If None, defaults are resolved at fit time based on ml_type.

n_inner_splits = field(validator=[validators.instance_of(int)], default=5) class-attribute instance-attribute

Number of inner folds.

threshold = field(default=100, validator=[validators.instance_of(int), validators.ge(0), validators.le(100)]) class-attribute instance-attribute

Percentile threshold for comparison between shadow and real features (0-100).

create_module()

Create BorutaModule execution instance.

Source code in octopus/modules/boruta/module.py
def create_module(self) -> ModuleExecution:
    """Instantiate the BorutaModule that will execute this task."""
    # Deferred import keeps heavy dependencies out of the configuration stage.
    from .core import BorutaModule  # noqa: PLC0415

    module = BorutaModule(config=self)
    return module

DataPartition

Bases: StrEnum

Dataset partitions for feature importance computation.

Source code in octopus/types.py
class DataPartition(StrEnum):
    """Dataset partitions for feature importance computation.

    Members are string-valued so they can be used directly as labels
    (e.g. in DataFrame columns or file names).
    """

    TRAIN = "train"  # training partition
    DEV = "dev"  # development/validation partition
    TEST = "test"  # held-out test partition

FIResultLabel

Bases: StrEnum

Labels used in feature-importance result DataFrames.

Every module writes a fi_method column into its result DataFrame. Use these members as the column values so downstream code can filter and aggregate results reliably.

Source code in octopus/types.py
class FIResultLabel(StrEnum):
    """Labels used in feature-importance result DataFrames.

    Every module writes a ``fi_method`` column into its result DataFrame.
    Use these members as the column values so downstream code can filter
    and aggregate results reliably.
    """

    INTERNAL = "internal"  # importance reported by the model itself
    PERMUTATION = "permutation"  # permutation-based importance
    SHAP = "shap"  # SHAP-value importance
    LOFO = "lofo"  # leave-one-feature-out importance
    CONSTANT = "constant"
    COUNTS = "counts"
    COUNTS_RELATIVE = "counts_relative"  # presumably COUNTS normalized to a ratio — confirm in producer code

ModuleExecution

Bases: ABC

Base execution class. Created on worker via config.create_module().

Source code in octopus/modules/base.py
@define
class ModuleExecution[T: Task](ABC):
    """Base execution class. Created on worker via config.create_module().

    Generic over the Task subclass ``T`` that configures this execution.
    Subclasses implement :meth:`fit` and return their results as a mapping
    from ``ResultType`` to ``ModuleResult``.
    """

    config: T = field()  # the Task configuration this execution was created from

    @abstractmethod
    def fit(
        self,
        *,
        data_traindev: pd.DataFrame,
        data_test: pd.DataFrame,
        feature_cols: list[str],
        study_context: StudyContext,
        outer_split_id: int,
        results_dir: UPath,
        scratch_dir: UPath,
        n_assigned_cpus: int,
        feature_groups: dict[str, list[str]],
        dependency_results: dict[ResultType, ModuleResult],
        **kwargs,
    ) -> dict[ResultType, ModuleResult]:
        """Fit the module. Returns dict mapping ResultType to ModuleResult.

        Args:
            data_traindev: Train/dev rows for the current outer split.
            data_test: Held-out test rows for the current outer split.
            feature_cols: Feature column names available to the module.
            study_context: Immutable runtime context for the study.
            outer_split_id: Index of the outer split being processed.
            results_dir: Directory where results are written.
            scratch_dir: Directory for temporary files.
            n_assigned_cpus: Number of CPUs assigned to this execution.
            feature_groups: Mapping of group name to member feature columns.
            dependency_results: Upstream results keyed by ResultType.
            **kwargs: Implementation-specific extras.

        Raises:
            NotImplementedError: Always, in this abstract base implementation.
        """
        raise NotImplementedError("Subclasses must implement fit()")

    def is_fitted(self) -> bool:
        """Check if module has been fitted.

        Returns True only when a ``selected_features_`` attribute exists and
        is not None — presumably set by concrete ``fit()`` implementations.
        """
        if hasattr(self, "selected_features_"):
            return self.selected_features_ is not None
        return False

fit(*, data_traindev, data_test, feature_cols, study_context, outer_split_id, results_dir, scratch_dir, n_assigned_cpus, feature_groups, dependency_results, **kwargs) abstractmethod

Fit the module. Returns dict mapping ResultType to ModuleResult.

Source code in octopus/modules/base.py
@abstractmethod
def fit(
    self,
    *,
    data_traindev: pd.DataFrame,
    data_test: pd.DataFrame,
    feature_cols: list[str],
    study_context: StudyContext,
    outer_split_id: int,
    results_dir: UPath,
    scratch_dir: UPath,
    n_assigned_cpus: int,
    feature_groups: dict[str, list[str]],
    dependency_results: dict[ResultType, ModuleResult],
    **kwargs,
) -> dict[ResultType, ModuleResult]:
    """Fit the module. Returns dict mapping ResultType to ModuleResult.

    Args:
        data_traindev: Train/dev rows for the current outer split.
        data_test: Held-out test rows for the current outer split.
        feature_cols: Feature column names available to the module.
        study_context: Immutable runtime context for the study.
        outer_split_id: Index of the outer split being processed.
        results_dir: Directory where results are written.
        scratch_dir: Directory for temporary files.
        n_assigned_cpus: Number of CPUs assigned to this execution.
        feature_groups: Mapping of group name to member feature columns.
        dependency_results: Upstream results keyed by ResultType.
        **kwargs: Implementation-specific extras.

    Raises:
        NotImplementedError: Always, in this abstract base implementation.
    """
    raise NotImplementedError("Subclasses must implement fit()")

is_fitted()

Check if module has been fitted.

Source code in octopus/modules/base.py
def is_fitted(self) -> bool:
    """Return True once fitting has produced a non-None ``selected_features_``."""
    # Missing attribute and attribute-set-to-None both count as "not fitted".
    return getattr(self, "selected_features_", None) is not None

ModuleResult

Unified result container for a single result type from a module.

Carries all 5 artifacts (selected_features, scores, predictions, fi, model) and knows how to save/load itself. Each result_type gets its own directory on disk.

Source code in octopus/modules/result.py
@define
class ModuleResult:
    """Unified result container for a single result type from a module.

    Carries all 5 artifacts (selected_features, scores, predictions,
    fi, model) and knows how to save/load itself.
    Each result_type gets its own directory on disk.
    """

    result_type: ResultType = field()
    module: str = field()
    selected_features: list[str] = field(factory=list)
    scores: pd.DataFrame | None = field(default=None)
    predictions: pd.DataFrame | None = field(default=None)
    fi: pd.DataFrame | None = field(default=None)
    model: Any = field(default=None)

    def save(self, result_dir: UPath) -> None:
        """Persist this result into a directory.

        Stamps module + result_type columns on DataFrames, saves parquets,
        selected_features.json, and model/ subdirectory if model is not None.

        Args:
            result_dir: Directory to save into (e.g. task0/best/)
        """
        result_dir.mkdir(parents=True, exist_ok=True)

        # Selected features are always written, even when empty.
        with (result_dir / "selected_features.json").open("w") as f:
            json.dump(self.selected_features, f)

        # Write each non-empty DataFrame artifact with provenance columns stamped on.
        artifacts = {
            "scores": self.scores,
            "predictions": self.predictions,
            "feature_importances": self.fi,
        }
        for name, df in artifacts.items():
            if df is None or df.empty:
                continue
            stamped = df.copy()
            stamped["module"] = self.module
            stamped["result_type"] = self.result_type.value
            parquet_save(stamped, result_dir / f"{name}.parquet")

        # Model (when present) goes into its own subdirectory together with
        # the predictor state needed to apply it later.
        if self.model is not None:
            model_dir = result_dir / "model"
            model_dir.mkdir(parents=True, exist_ok=True)
            joblib_save(self.model, model_dir / "model.joblib")
            with (model_dir / "predictor.json").open("w") as f:
                json.dump({"selected_features": self.selected_features}, f, indent=2)

    @classmethod
    def load(cls, result_dir: UPath, result_type: ResultType, module: str) -> "ModuleResult":
        """Reconstruct a ModuleResult from a saved directory.

        Args:
            result_dir: Directory containing saved result files
            result_type: The ResultType for this directory
            module: Module name

        Returns:
            Reconstructed ModuleResult instance
        """
        # Selected features default to an empty list when the file is absent.
        selected_features: list[str] = []
        sf_path = result_dir / "selected_features.json"
        if sf_path.exists():
            with sf_path.open() as f:
                selected_features = json.load(f)

        # Read each optional parquet artifact; missing files stay None.
        frames: dict[str, pd.DataFrame | None] = {}
        for name in ("scores", "predictions", "feature_importances"):
            path = result_dir / f"{name}.parquet"
            frames[name] = parquet_load(path) if path.exists() else None

        # Model is optional as well.
        model = None
        model_path = result_dir / "model" / "model.joblib"
        if model_path.exists():
            model = joblib_load(model_path)

        return cls(
            result_type=result_type,
            module=module,
            selected_features=selected_features,
            scores=frames["scores"],
            predictions=frames["predictions"],
            fi=frames["feature_importances"],
            model=model,
        )

load(result_dir, result_type, module) classmethod

Load a ModuleResult from a saved directory.

Parameters:

Name Type Description Default
result_dir UPath

Directory containing saved result files

required
result_type ResultType

The ResultType for this directory

required
module str

Module name

required

Returns:

Type Description
ModuleResult

Reconstructed ModuleResult instance

Source code in octopus/modules/result.py
@classmethod
def load(cls, result_dir: UPath, result_type: ResultType, module: str) -> "ModuleResult":
    """Reconstruct a ModuleResult from a saved directory.

    Args:
        result_dir: Directory containing saved result files
        result_type: The ResultType for this directory
        module: Module name

    Returns:
        Reconstructed ModuleResult instance
    """
    # Selected features default to an empty list when the file is absent.
    selected_features: list[str] = []
    sf_path = result_dir / "selected_features.json"
    if sf_path.exists():
        with sf_path.open() as f:
            selected_features = json.load(f)

    # Read each optional parquet artifact; missing files stay None.
    frames: dict[str, pd.DataFrame | None] = {}
    for name in ("scores", "predictions", "feature_importances"):
        path = result_dir / f"{name}.parquet"
        frames[name] = parquet_load(path) if path.exists() else None

    # Model is optional as well.
    model = None
    model_path = result_dir / "model" / "model.joblib"
    if model_path.exists():
        model = joblib_load(model_path)

    return cls(
        result_type=result_type,
        module=module,
        selected_features=selected_features,
        scores=frames["scores"],
        predictions=frames["predictions"],
        fi=frames["feature_importances"],
        model=model,
    )

save(result_dir)

Save this result to a directory.

Stamps module + result_type columns on DataFrames, saves parquets, selected_features.json, and model/ subdirectory if model is not None.

Parameters:

Name Type Description Default
result_dir UPath

Directory to save into (e.g. task0/best/)

required
Source code in octopus/modules/result.py
def save(self, result_dir: UPath) -> None:
    """Persist this result into a directory.

    Stamps module + result_type columns on DataFrames, saves parquets,
    selected_features.json, and model/ subdirectory if model is not None.

    Args:
        result_dir: Directory to save into (e.g. task0/best/)
    """
    result_dir.mkdir(parents=True, exist_ok=True)

    # Selected features are always written, even when empty.
    with (result_dir / "selected_features.json").open("w") as f:
        json.dump(self.selected_features, f)

    # Write each non-empty DataFrame artifact with provenance columns stamped on.
    artifacts = {
        "scores": self.scores,
        "predictions": self.predictions,
        "feature_importances": self.fi,
    }
    for name, df in artifacts.items():
        if df is None or df.empty:
            continue
        stamped = df.copy()
        stamped["module"] = self.module
        stamped["result_type"] = self.result_type.value
        parquet_save(stamped, result_dir / f"{name}.parquet")

    # Model (when present) goes into its own subdirectory together with
    # the predictor state needed to apply it later.
    if self.model is not None:
        model_dir = result_dir / "model"
        model_dir.mkdir(parents=True, exist_ok=True)
        joblib_save(self.model, model_dir / "model.joblib")
        with (model_dir / "predictor.json").open("w") as f:
            json.dump({"selected_features": self.selected_features}, f, indent=2)

Mrmr

Bases: Task

MRMR module for feature selection based on mutual information and redundancy.

Uses the maximum relevance minimum redundancy algorithm to select features that are maximally relevant to the target while minimizing redundancy among selected features.

Configuration

n_features: Number of features to select
correlation_type: Type of correlation to measure redundancy
relevance_type: Method to calculate relevance (MRMRRelevance.FROM_DEPENDENCY or MRMRRelevance.F_STATISTICS)
feature_importance_type: FI aggregation type (only used with FROM_DEPENDENCY relevance)
feature_importance_method: FI method to filter from dependency task (only used with FROM_DEPENDENCY relevance)

Source code in octopus/modules/mrmr/module.py
@define
class Mrmr(Task):
    """MRMR module for feature selection based on mutual information and redundancy.

    Uses the maximum relevance minimum redundancy algorithm to select features
    that are maximally relevant to the target while minimizing redundancy among
    selected features.

    Configuration:
        n_features: Number of features to select
        correlation_type: Type of correlation to measure redundancy
        relevance_type: Method to calculate relevance (MRMRRelevance.FROM_DEPENDENCY or MRMRRelevance.F_STATISTICS)
        feature_importance_type: FI aggregation type (only used with FROM_DEPENDENCY relevance)
        feature_importance_method: FI method to filter from dependency task (only used with FROM_DEPENDENCY relevance)
    """

    n_features: int = field(validator=[validators.instance_of(int)], default=Factory(lambda: 30))
    """Number of features selected by MRMR."""

    # Only PEARSON, SPEARMAN, and RDC are accepted for redundancy scoring.
    correlation_type: CorrelationType = field(
        converter=CorrelationType,
        validator=validators.in_([CorrelationType.PEARSON, CorrelationType.SPEARMAN, CorrelationType.RDC]),
        default=CorrelationType.SPEARMAN,
    )
    """Selection of correlation type."""

    # Any MRMRRelevance member is allowed here.
    relevance_type: MRMRRelevance = field(
        converter=MRMRRelevance, validator=validators.in_(list(MRMRRelevance)), default=MRMRRelevance.FROM_DEPENDENCY
    )
    """Method to calculate relevance (permutation or f-statistics)."""

    feature_importance_type: MRMRFIAggregation = field(
        converter=MRMRFIAggregation, validator=validators.in_(list(MRMRFIAggregation)), default=MRMRFIAggregation.MEAN
    )
    """FI aggregation type. Only used when relevance_type is FROM_DEPENDENCY."""

    # Only PERMUTATION, SHAP, and LOFO are accepted here.
    feature_importance_method: FIComputeMethod = field(
        converter=FIComputeMethod,
        validator=validators.in_([FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.LOFO]),
        default=FIComputeMethod.PERMUTATION,
    )
    """FI method to filter from the dependency task's results. Only used when relevance_type is FROM_DEPENDENCY."""

    def create_module(self) -> ModuleExecution:
        """Create MrmrModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import MrmrModule  # noqa: PLC0415

        return MrmrModule(config=self)

correlation_type = field(converter=CorrelationType, validator=(validators.in_([CorrelationType.PEARSON, CorrelationType.SPEARMAN, CorrelationType.RDC])), default=(CorrelationType.SPEARMAN)) class-attribute instance-attribute

Selection of correlation type.

feature_importance_method = field(converter=FIComputeMethod, validator=(validators.in_([FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.LOFO])), default=(FIComputeMethod.PERMUTATION)) class-attribute instance-attribute

FI method to filter from the dependency task's results. Only used when relevance_type is FROM_DEPENDENCY.

feature_importance_type = field(converter=MRMRFIAggregation, validator=(validators.in_(list(MRMRFIAggregation))), default=(MRMRFIAggregation.MEAN)) class-attribute instance-attribute

FI aggregation type. Only used when relevance_type is FROM_DEPENDENCY.

n_features = field(validator=[validators.instance_of(int)], default=(Factory(lambda: 30))) class-attribute instance-attribute

Number of features selected by MRMR.

relevance_type = field(converter=MRMRRelevance, validator=(validators.in_(list(MRMRRelevance))), default=(MRMRRelevance.FROM_DEPENDENCY)) class-attribute instance-attribute

Method to calculate relevance (permutation or f-statistics).

create_module()

Create MrmrModule execution instance.

Source code in octopus/modules/mrmr/module.py
def create_module(self) -> ModuleExecution:
    """Instantiate the MrmrModule that will execute this task."""
    # Deferred import keeps heavy dependencies out of the configuration stage.
    from .core import MrmrModule  # noqa: PLC0415

    module = MrmrModule(config=self)
    return module

ResultType

Bases: StrEnum

Types of results produced by modules.

Source code in octopus/types.py
class ResultType(StrEnum):
    """Types of results produced by modules.

    Used both as dict keys for module outputs and as on-disk directory names.
    """

    BEST = "best"  # result from the single best configuration
    ENSEMBLE_SELECTION = "ensemble_selection"  # result from ensemble selection

Roc

Bases: Task

ROC module for removing correlated features.

This module identifies groups of correlated features and selects the most informative feature from each group, removing the rest. Uses correlation analysis (Spearman or RDC) combined with feature relevance scoring (mutual information or F-statistics) to determine which features to keep.

Configuration

correlation_threshold: Correlation threshold above which features are considered correlated
correlation_type: Type of correlation measure (CorrelationType.SPEARMAN or CorrelationType.RDC)
relevance_method: Method to select best feature in group (RelevanceMethod.MUTUAL_INFO or RelevanceMethod.F_STATISTICS)

Source code in octopus/modules/roc/module.py
@define
class Roc(Task):
    """ROC module for removing correlated features.

    This module identifies groups of correlated features and selects the most
    informative feature from each group, removing the rest. Uses correlation
    analysis (Spearman or RDC) combined with feature relevance scoring (mutual
    information or F-statistics) to determine which features to keep.

    Configuration:
        correlation_threshold: Correlation threshold above which features are considered correlated
        correlation_type: Type of correlation measure (CorrelationType.SPEARMAN or CorrelationType.RDC)
        relevance_method: Method to select best feature in group (RelevanceMethod.MUTUAL_INFO or RelevanceMethod.F_STATISTICS)
    """

    # NOTE(review): instance_of(float) rejects ints, so e.g. `correlation_threshold=1`
    # fails validation — pass `1.0` instead. Confirm this strictness is intended.
    correlation_threshold: float = field(validator=[validators.instance_of(float)], default=0.8)
    """Correlation threshold for feature removal (features with correlation > threshold are grouped)."""

    # Unlike Mrmr, PEARSON is not accepted here.
    correlation_type: CorrelationType = field(
        converter=CorrelationType,
        validator=validators.in_([CorrelationType.SPEARMAN, CorrelationType.RDC]),
        default=CorrelationType.SPEARMAN,
    )
    """Selection of correlation type."""

    relevance_method: RelevanceMethod = field(
        converter=RelevanceMethod,
        validator=validators.in_([RelevanceMethod.MUTUAL_INFO, RelevanceMethod.F_STATISTICS]),
        default=RelevanceMethod.F_STATISTICS,
    )
    """Method to score feature relevance within correlated groups."""

    def create_module(self) -> ModuleExecution:
        """Create RocModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import RocModule  # noqa: PLC0415

        return RocModule(config=self)

correlation_threshold = field(validator=[validators.instance_of(float)], default=0.8) class-attribute instance-attribute

Correlation threshold for feature removal (features with correlation > threshold are grouped).

correlation_type = field(converter=CorrelationType, validator=(validators.in_([CorrelationType.SPEARMAN, CorrelationType.RDC])), default=(CorrelationType.SPEARMAN)) class-attribute instance-attribute

Selection of correlation type.

relevance_method = field(converter=RelevanceMethod, validator=(validators.in_([RelevanceMethod.MUTUAL_INFO, RelevanceMethod.F_STATISTICS])), default=(RelevanceMethod.F_STATISTICS)) class-attribute instance-attribute

Method to score feature relevance within correlated groups.

create_module()

Create RocModule execution instance.

Source code in octopus/modules/roc/module.py
def create_module(self) -> ModuleExecution:
    """Instantiate the RocModule that will execute this task."""
    # Deferred import keeps heavy dependencies out of the configuration stage.
    from .core import RocModule  # noqa: PLC0415

    module = RocModule(config=self)
    return module

StudyContext

Immutable runtime context passed to modules during fit().

Contains only the finalized/prepared values needed by modules. No OctoStudy dependency - only attrs + upath.

Source code in octopus/modules/context.py
@frozen
class StudyContext:
    """Immutable runtime context passed to modules during fit().

    Contains only the finalized/prepared values needed by modules.
    No OctoStudy dependency - only attrs + upath.

    Instances are frozen (attrs ``@frozen``), so attribute assignment after
    construction raises.
    """

    ml_type: MLType
    """MLType enum (e.g. MLType.BINARY, MLType.REGRESSION, MLType.TIMETOEVENT)."""

    target_metric: str
    """Primary metric for model evaluation."""

    target_assignments: dict[str, str]
    """Target column assignments (e.g. {'default': 'target'} or {'duration': ..., 'event': ...})."""

    positive_class: int | None
    """Positive class label for binary classification. None for regression/multiclass."""

    stratification_col: str | None
    """Column used for stratification during data splitting."""

    sample_id_col: str
    """Identifier for sample instances."""

    feature_cols: list[str]
    """Prepared feature columns (from PreparedData.feature_cols)."""

    row_id_col: str
    """Prepared row identifier (from PreparedData.row_id_col)."""

    output_path: UPath
    """Full output path for this study."""

    log_dir: UPath
    """Directory where logs are stored."""

feature_cols instance-attribute

Prepared feature columns (from PreparedData.feature_cols).

log_dir instance-attribute

Directory where logs are stored.

ml_type instance-attribute

MLType enum (e.g. MLType.BINARY, MLType.REGRESSION, MLType.TIMETOEVENT).

output_path instance-attribute

Full output path for this study.

positive_class instance-attribute

Positive class label for binary classification. None for regression/multiclass.

row_id_col instance-attribute

Prepared row identifier (from PreparedData.row_id_col).

sample_id_col instance-attribute

Identifier for sample instances.

stratification_col instance-attribute

Column used for stratification during data splitting.

target_assignments instance-attribute

Target column assignments (e.g. {'default': 'target'} or {'duration': ..., 'event': ...}).

target_metric instance-attribute

Primary metric for model evaluation.

Tako

Bases: Task

Tako module for feature selection and model optimization.

Uses Optuna for hyperparameter optimization with cross-validation, supporting:

- Multiple ML models
- MRMR feature selection
- Ensemble selection
- Bag-based model ensembling

Configuration

models: List of model names to optimize
n_inner_splits: Number of inner CV splits
n_trials: Number of Optuna trials
ensemble_selection: Whether to perform ensemble selection
n_mrmr_features: Number-of-feature options for MRMR-based Optuna search

Source code in octopus/modules/tako/module.py
@define
class Tako(Task):
    """Tako module for feature selection and model optimization.

    Uses Optuna for hyperparameter optimization with cross-validation, supporting:
    - Multiple ML models
    - MRMR feature selection
    - Ensemble selection
    - Bag-based model ensembling

    Configuration:
        models: List of model names to optimize
        n_inner_splits: Number of inner CV splits
        n_trials: Number of Optuna trials
        ensemble_selection: Whether to perform ensemble selection
        n_mrmr_features: Number-of-feature options for MRMR-based Optuna search
    """

    models: list[ModelName] | None = field(
        default=None,
        converter=_convert_models,
    )
    """Models for ML. If None, defaults are resolved at fit time based on ml_type."""

    n_inner_splits: int = field(validator=[validators.instance_of(int)], default=5)
    """Number of inner splits."""

    inner_split_seeds: list[int] = field(
        default=Factory(lambda: [0]),
        validator=validators.deep_iterable(
            member_validator=validators.instance_of(int),
            iterable_validator=validators.instance_of(list),
        ),
    )
    """List of integers used as seeds for data splitting."""

    max_outliers: int = field(validator=[validators.instance_of(int)], default=3)
    """Maximum number of outliers, optimized by Optuna"""

    # Only PERMUTATION, SHAP, and CONSTANT are accepted; values are coerced via the converter.
    fi_methods: list[FIComputeMethod] = field(
        default=Factory(lambda: [FIComputeMethod.PERMUTATION]),
        converter=lambda vs: [FIComputeMethod(v) for v in vs],
        validator=validators.deep_iterable(
            member_validator=validators.in_(
                [FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.CONSTANT]
            ),
            iterable_validator=validators.instance_of(list),
        ),
    )
    """Feature importance methods for best bag."""

    n_startup_trials: int = field(validator=[validators.instance_of(int)], default=15)
    """Number of Optuna startup trials (random sampler)"""

    ensemble_selection: bool = field(validator=[validators.in_([True, False])], default=False)
    """Whether to perform ensemble selection."""

    n_ensemble_candidates: int = field(validator=[validators.instance_of(int), validators.ge(1)], default=50)
    """Number of top-performing bags to keep as candidates for ensemble selection."""

    # Trial count is reduced in the test suite to keep CI fast.
    n_trials: int = field(validator=[validators.instance_of(int)], default=200 if not _RUNNING_IN_TESTSUITE else 3)
    """Number of Optuna trials."""

    hyperparameters: dict = field(validator=[validators.instance_of(dict)], default=Factory(dict))
    """Bring own hyperparameter space."""

    max_features: int = field(validator=[validators.instance_of(int)], default=0)
    """Maximum features to constrain hyperparameter optimization. Default is zero (off)."""

    penalty_factor: float = field(validator=[validators.instance_of(float)], default=1.0)
    """Penalty multiplier for the feature-count constraint in Optuna optimization.

    When ``max_features > 0``, Optuna penalises trials that use more features
    than allowed::

        penalty = penalty_factor * excess_features / total_features

    This penalty is subtracted from the optimisation target in the same numeric
    space as the target metric.  The default of ``1.0`` works well for metrics
    bounded between 0 and 1 (AUCROC, ACCBAL, R2, …).  For metrics on a larger
    scale (MAE, MSE, RMSE, …) the penalty becomes negligible relative to the
    score and feature constraining has no effect.  In that case, increase
    ``penalty_factor`` to match the metric's magnitude — e.g. if MAE ≈ 100,
    try ``penalty_factor=100.0``.
    """

    n_mrmr_features: list[int] = field(validator=[validators.instance_of(list)], default=Factory(list))
    """Number-of-feature options for MRMR pre-selection during Optuna optimization.

    Each integer specifies a number of top features to pre-select via MRMR
    (Max-Relevance Min-Redundancy). The resulting subsets become an additional
    Optuna hyperparameter, so each trial may use a different subset size.
    The full feature set is always included as an option.

    Example: ``[10, 20, 50]`` pre-computes the top-10, top-20, and top-50
    MRMR features; Optuna then chooses among these three subsets plus all
    features.  An empty list (default) disables MRMR and uses all features
    in every trial.
    """

    scoring_method: ScoringMethod = field(
        default=ScoringMethod.COMBINED,
        converter=ScoringMethod,
        validator=validators.in_(list(ScoringMethod)),
    )
    """How to calculate the bag performance for the optuna optimization target."""

    def __attrs_post_init__(self):
        """Validate that all configured models support constrained HPO when it is enabled.

        Raises:
            ValueError: If ``max_features > 0`` and any configured model is not
                flagged as ``chpo_compatible`` in its model config.
        """
        # Only enforce constrained-HPO compatibility when max_features > 0 and models are specified
        if self.max_features > 0 and self.models is not None:
            incompatible_models: list[ModelName] = []

            for m in self.models:
                config = Models.get_config(m)
                chpo_flag = config.chpo_compatible
                logger.info(f"Model '{m}': chpo_compatible={chpo_flag}")

                if not chpo_flag:
                    incompatible_models.append(m)

            if incompatible_models:
                # NOTE(review): assumes ModelName members are str-like so join() works — confirm.
                # Fixed grammar in user-facing message ("those model" -> "those models").
                msg = (
                    "Tako: The following models are not compatible with constrained HPO. "
                    "Please remove those models or turn constrained HPO off (max_features=0): "
                    + ", ".join(incompatible_models)
                )
                logger.error(msg)
                raise ValueError(msg)

    def create_module(self) -> ModuleExecution:
        """Create TakoModule execution instance."""
        # import only during execution to avoid heavy dependency at config stage
        from .core import TakoModuleTemplate  # noqa: PLC0415

        return TakoModuleTemplate(config=self)

ensemble_selection = field(validator=[validators.in_([True, False])], default=False) class-attribute instance-attribute

Whether to perform ensemble selection.

fi_methods = field(default=(Factory(lambda: [FIComputeMethod.PERMUTATION])), converter=(lambda vs: [(FIComputeMethod(v)) for v in vs]), validator=(validators.deep_iterable(member_validator=(validators.in_([FIComputeMethod.PERMUTATION, FIComputeMethod.SHAP, FIComputeMethod.CONSTANT])), iterable_validator=(validators.instance_of(list))))) class-attribute instance-attribute

Feature importance methods for best bag.

hyperparameters = field(validator=[validators.instance_of(dict)], default=(Factory(dict))) class-attribute instance-attribute

Bring own hyperparameter space.

inner_split_seeds = field(default=(Factory(lambda: [0])), validator=(validators.deep_iterable(member_validator=(validators.instance_of(int)), iterable_validator=(validators.instance_of(list))))) class-attribute instance-attribute

List of integers used as seeds for data splitting.

max_features = field(validator=[validators.instance_of(int)], default=0) class-attribute instance-attribute

Maximum features to constrain hyperparameter optimization. Default is zero (off).

max_outliers = field(validator=[validators.instance_of(int)], default=3) class-attribute instance-attribute

Maximum number of outliers, optimized by Optuna.

models = field(default=None, converter=_convert_models) class-attribute instance-attribute

Models for ML. If None, defaults are resolved at fit time based on ml_type.

n_ensemble_candidates = field(validator=[validators.instance_of(int), validators.ge(1)], default=50) class-attribute instance-attribute

Number of top-performing bags to keep as candidates for ensemble selection.

n_inner_splits = field(validator=[validators.instance_of(int)], default=5) class-attribute instance-attribute

Number of inner splits.

n_mrmr_features = field(validator=[validators.instance_of(list)], default=(Factory(list))) class-attribute instance-attribute

Number-of-feature options for MRMR pre-selection during Optuna optimization.

Each integer specifies a number of top features to pre-select via MRMR (Max-Relevance Min-Redundancy). The resulting subsets become an additional Optuna hyperparameter, so each trial may use a different subset size. The full feature set is always included as an option.

Example: [10, 20, 50] pre-computes the top-10, top-20, and top-50 MRMR features; Optuna then chooses among these three subsets plus all features. An empty list (default) disables MRMR and uses all features in every trial.

n_startup_trials = field(validator=[validators.instance_of(int)], default=15) class-attribute instance-attribute

Number of Optuna startup trials (random sampler).

n_trials = field(validator=[validators.instance_of(int)], default=(200 if not _RUNNING_IN_TESTSUITE else 3)) class-attribute instance-attribute

Number of Optuna trials.

penalty_factor = field(validator=[validators.instance_of(float)], default=1.0) class-attribute instance-attribute

Penalty multiplier for the feature-count constraint in Optuna optimization.

When max_features > 0, Optuna penalises trials that use more features than allowed::

penalty = penalty_factor * excess_features / total_features

This penalty is subtracted from the optimisation target in the same numeric space as the target metric. The default of 1.0 works well for metrics bounded between 0 and 1 (AUCROC, ACCBAL, R2, …). For metrics on a larger scale (MAE, MSE, RMSE, …) the penalty becomes negligible relative to the score and feature constraining has no effect. In that case, increase penalty_factor to match the metric's magnitude — e.g. if MAE ≈ 100, try penalty_factor=100.0.

scoring_method = field(default=(ScoringMethod.COMBINED), converter=ScoringMethod, validator=(validators.in_(list(ScoringMethod)))) class-attribute instance-attribute

How to calculate the bag performance for the optuna optimization target.

create_module()

Create TakoModule execution instance.

Source code in octopus/modules/tako/module.py
def create_module(self) -> ModuleExecution:
    """Create TakoModule execution instance."""
    # import only during execution to avoid heavy dependency at config stage
    from .core import TakoModuleTemplate  # noqa: PLC0415

    return TakoModuleTemplate(config=self)

Task

Bases: ABC

Base config class for all workflow tasks.

Source code in octopus/modules/base.py
@define
class Task(ABC):
    """Base config class for all workflow tasks."""

    # Non-negative identifier of this task within a workflow.
    task_id: int = field(validator=[validators.instance_of(int), validators.ge(0)])
    # task_id of a prerequisite task, or None when the task has no dependency
    # — presumably validated against existing tasks by the workflow; confirm.
    depends_on: int | None = field(default=None, validator=validators.optional(validators.instance_of(int)))
    # Free-text description of the task (empty by default).
    description: str = field(default="", validator=[validators.instance_of(str)])
    # Toggle for categorical encoding — assumed to control encoding of
    # categorical features during execution; TODO confirm with module code.
    categorical_encoding: bool = field(default=False, validator=[validators.instance_of(bool)])

    @property
    def module(self) -> str:
        """Module name derived from class name (lower-cased class name)."""
        return type(self).__name__.lower()

    @abstractmethod
    def create_module(self) -> ModuleExecution:
        """Create an execution module from this config."""
        raise NotImplementedError("Subclasses must implement create_module()")

module property

Module name derived from class name.

create_module() abstractmethod

Create an execution module from this config.

Source code in octopus/modules/base.py
@abstractmethod
def create_module(self) -> ModuleExecution:
    """Create an execution module from this config."""
    raise NotImplementedError("Subclasses must implement create_module()")

rdc_correlation_matrix(df)

Calculate RDC correlation matrix.

Source code in octopus/modules/utils.py
def rdc_correlation_matrix(df):
    """Calculate RDC correlation matrix.

    Returns an (n_features, n_features) symmetric matrix of pairwise RDC
    values with a unit diagonal.
    """
    n_features = len(df.columns)
    # Identity matrix gives the 1.0 self-correlation diagonal for free.
    matrix = np.eye(n_features)

    # RDC is symmetric, so each unordered pair is computed exactly once.
    for i in range(n_features):
        left = df.iloc[:, i].values
        for j in range(i + 1, n_features):
            value = rdc(left, df.iloc[:, j].values)
            matrix[i, j] = value
            matrix[j, i] = value

    return matrix