Workflow example: ROC + Octo.
This example demonstrates how to use Octopus with the ROC (Remove Outliers and Correlations) and Octo modules for binary classification on the breast cancer dataset. The workflow has two steps: (1) the ROC module performs feature correlation analysis and filtering; (2) the Octo module performs model training and hyperparameter optimization.
import os

from octopus.example_data import load_breast_cancer_data
from octopus.modules import Octo, Roc
from octopus.study import OctoClassification
from octopus.types import CorrelationType, FIComputeMethod, ModelName, ROCFilterMethod
Load and Preprocess Data
Load the breast cancer dataset from sklearn. This is a binary classification dataset with 30 features. Target: 0 = malignant, 1 = benign.
# Load the example data used by the summary below and by the study definition.
# NOTE(review): the (df, features, targets) return order is assumed from how
# these names are used in this script — confirm against
# octopus.example_data.load_breast_cancer_data.
df, features, targets = load_breast_cancer_data()

# Print a short summary of the loaded data: feature names, row count,
# the entries of `targets`, and how often each value of the "target"
# column occurs (sorted by value).
print("Dataset info:")
print(f" Features: {len(features)} - {features}")
print(f" Samples: {df.shape[0]}")
print(f" Classes: {len(targets)} - {targets}")
print(f" Target distribution: {df['target'].value_counts().sort_index().to_dict()}")
Create and run OctoClassification with ROC + Octo workflow
# Define the classification study. The workflow has two tasks: a ROC step
# (task 0) followed by an Octo step (task 1) that depends on task 0.
# Requires `os` to be imported at the top of the file for the STUDIES_PATH lookup.
study = OctoClassification(
    name="example_roc_octo",
    # Output directory: taken from the STUDIES_PATH environment variable,
    # falling back to "./studies" when it is not set.
    path=os.environ.get("STUDIES_PATH", "./studies"),
    target_metric="ACCBAL",  # Balanced accuracy for binary classification
    feature_cols=features,
    target_col="target",
    sample_id_col="index",
    stratification_col="target",
    datasplit_seed_outer=1234,
    ignore_data_health_warning=True,
    outer_parallelization=True,
    run_single_outersplit_num=0,  # Process only first outersplit for quick testing
    workflow=[
        # Step 0: ROC - Remove highly correlated features and apply statistical filtering
        Roc(
            description="step_0_roc",
            task_id=0,
            depends_on=None,  # First step, no input dependency
            threshold=0.85,  # Remove features with correlation > 0.85
            correlation_type=CorrelationType.SPEARMAN,  # Use Spearman correlation
            filter_type=ROCFilterMethod.F_STATISTICS,  # Apply F-statistics filtering
        ),
        # Step 1: Octo - Train models on filtered features from ROC step
        Octo(
            description="step_1_octo",
            task_id=1,
            depends_on=0,  # Use output from ROC step
            # Cross-validation settings
            n_folds_inner=5,
            # Model selection
            models=[
                ModelName.ExtraTreesClassifier,
                # ModelName.RandomForestClassifier,
            ],
            model_seed=0,
            n_jobs=1,
            max_outl=0,  # No outlier removal
            fi_methods_bestbag=[FIComputeMethod.PERMUTATION],  # Feature importance method
            # Parallelization settings
            inner_parallelization=True,
            n_workers=5,
            # Hyperparameter optimization with Optuna
            optuna_seed=0,
            n_optuna_startup_trials=10,
            n_trials=12,  # Number of hyperparameter optimization trials
            max_features=12,  # Maximum number of features to select
            penalty_factor=1.0,
        ),
    ],
)