Workflow: Octo + AutoGluon
This example demonstrates how to use Octopus with both Octo and AutoGluon modules in a PARALLEL workflow for binary classification. In this case, both modules are run on the same input data.
The workflow includes: 1. an Octo module, and 2. an AutoGluon module. Both modules operate on the same base input data.
import os

import pandas as pd

# NOTE(review): `make_classification` is used below but never imported here.
# It needs `from sklearn.datasets import make_classification` — confirm
# scikit-learn is a project dependency before adding it.
from octopus.modules import AutoGluon, Tako
from octopus.study import OctoClassification
from octopus.types import FIComputeMethod, ModelName
### Generate Synthetic Binary Classification Dataset
n_informative = 30
n_redundant = 30
n_repeated = 0
X, y = make_classification(
n_samples=300,
n_features=1000,
n_informative=n_informative,
n_redundant=n_redundant, # generated as random linear combinations of the informative features
n_repeated=n_repeated, # drawn randomly from the informative and the redundant features.
n_classes=2,
class_sep=1.0, # Controls class separability (higher = easier)
weights=[0.5, 0.5], # 60% class 0, 40% class 1
flip_y=0.01, # Add 1% label noise for realism
random_state=42,
shuffle=False, # ensure order of features
)
# Create a pandas DataFrame with proper structure
# Without shuffling, features are ordered: informative, redundant, repeated, then noise
feature_names = []
# Informative features (first n_informative)
feature_names.extend([f"informative_{i}" for i in range(n_informative)])
# Redundant features (next n_redundant)
feature_names.extend([f"redundant_{i}" for i in range(n_redundant)])
# Repeated features (next n_repeated)
if n_repeated > 0:
feature_names.extend([f"repeated_{i}" for i in range(n_repeated)])
# Remaining features are noise
n_noise = X.shape[1] - n_informative - n_redundant - n_repeated
feature_names.extend([f"noise_{i}" for i in range(n_noise)])
# Display dataset information
print("=== Synthetic Dataset Information ===")
print(f"Dataset shape: {df.shape}")
print(f"Features: {len(feature_names)}")
print(f"Class distribution:\n{df['target'].value_counts()}")
print(f"Class balance: {df['target'].value_counts(normalize=True).to_dict()}")
print("=====================================\n")
Create the OctoClassification study with a PARALLEL Octo + AutoGluon workflow
study = OctoClassification(
study_name="wf_octo_autogluon",
studies_directory=os.environ.get("STUDIES_PATH", "./studies"),
target_metric="AUCROC", # Area Under ROC Curve for binary classification
feature_cols=feature_names,
target_col="target",
sample_id_col="index",
stratification_col="target", # Ensure balanced splits
n_outer_splits=5, # 5-split outer cross-validation
workflow=[
# Step 0: octo
Tako(
description="step_0_tako",
task_id=0,
depends_on=None, # No dependency (parallel with AutoGluon)
# Cross-validation settings
n_inner_splits=5,
# Model selection - using tree-based models for feature importance
models=[
ModelName.ExtraTreesClassifier,
],
fi_methods=[FIComputeMethod.PERMUTATION], # Feature importance method
n_trials=100, # Number of hyperparameter optimization trials
# Constrained hyperparameter optimization
# max_features=60, # Maximum number of features to select
),
# Step 1: AutoGluon
AutoGluon(
description="step_1_autogluon",
task_id=1,
depends_on=None, # No dependency (parallel with Octo)
time_limit=600,
presets=["medium_quality"], # Balance between speed and accuracy
n_bag_splits=5, # 5-split bagging for ensemble models
included_model_types=[
"XT", # ExtraTrees
],
),
],
)