"""Example workflow: Octo on all features, then Mrmr feature selection, then Octo on the reduced set."""

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

from octopus.modules import Mrmr, Octo
from octopus.study import OctoClassification
from octopus.types import CorrelationType, ModelName

# Fix NumPy's global seed so the script is reproducible
np.random.seed(42)

# Settings for the artificial classification dataset.
# They are chosen so that the problem is "not too easy":
#   * 30 features in total, split into
#       - 15 informative features (50%)
#       - 10 redundant features (correlated with the informative ones)
#       -  5 repeated features (duplicates)
#   * class_sep=0.5 gives moderate difficulty (lower = harder)
#   * flip_y=0.1 adds 10% label noise
dataset_params = {
    "n_samples": 500,
    "n_features": 30,
    "n_informative": 15,
    "n_redundant": 10,
    "n_repeated": 5,
    "n_classes": 2,
    "n_clusters_per_class": 3,
    "weights": [0.6, 0.4],  # imbalanced classes
    "flip_y": 0.1,  # 10% label noise
    "class_sep": 0.5,  # moderate class separation (not too easy)
    "random_state": 42,
}

# Generate the feature matrix X and the label vector y
X, y = make_classification(**dataset_params)

# Name one column per generated feature. The count is taken from X itself
# rather than repeated as a literal, so the names always match the data.
feature_names = [f"feature_{i:02d}" for i in range(X.shape[1])]

# Assemble features and target into one DataFrame. reset_index() turns the
# default row index into a column named "index", which the study below
# references as its sample id column.
df = pd.DataFrame(X, columns=feature_names).assign(target=y).reset_index()

# Short summary of the generated dataset
class_counts = df["target"].value_counts().to_dict()
print("Dataset created:")
print(f"  Samples: {len(df)}")
print(f"  Features: {len(feature_names)}")
print(f"  Class distribution: {class_counts}")
print()

# Sequential multi-step workflow: each task after the first names the task it
# builds on through depends_on.
workflow_steps = [
    # Task 0: initial Octo run with all features
    Octo(
        description="step1_octo_full",
        task_id=0,
        depends_on=None,  # first task, depends on the input data
        models=[ModelName.ExtraTreesClassifier],
        n_trials=100,  # 100 trials for hyperparameter optimization
        n_folds_inner=5,  # 5 inner folds
        max_features=30,  # use all 30 features
    ),
    # Task 1: feature selection with Mrmr, fed by task 0
    Mrmr(
        description="step2_mrmr",
        task_id=1,
        depends_on=0,
        n_features=15,  # select the top 15 features
        correlation_type=CorrelationType.SPEARMAN,
    ),
    # Task 2: second Octo run on the reduced feature set from task 1
    Octo(
        description="step3_octo_reduced",
        task_id=2,
        depends_on=1,
        models=[ModelName.ExtraTreesClassifier],
        n_trials=100,
        n_folds_inner=5,
        ensemble_selection=True,
    ),
]

# Configure the classification study (it is only created here; fitting
# happens further down in the script).
study = OctoClassification(
    name="wf_octo_mrmr_octo",
    target_metric="ACCBAL",
    feature_cols=feature_names,
    target_col="target",
    sample_id_col="index",
    stratification_col="target",
    n_folds_outer=5,  # 5 outer folds
    ignore_data_health_warning=True,
    outer_parallelization=True,  # run all outer folds in parallel
    workflow=workflow_steps,
)

# Run the whole workflow on the generated dataset
print("Starting workflow execution...")
study.fit(data=df)

# Report completion and where the study wrote its results
print("Workflow completed successfully!")
results_location = study.output_path
print(f"Results saved to: {results_location}")