diff --git a/src/factorlab/analytics/attribution.py b/src/factorlab/analytics/attribution.py index bd5e5bd..08ef0bf 100644 --- a/src/factorlab/analytics/attribution.py +++ b/src/factorlab/analytics/attribution.py @@ -5,7 +5,7 @@ from statsmodels.tsa.tsatools import add_trend from factorlab.feature_engineering.transformations import Transform -from factorlab.forecasting.time_series_analysis import rolling_window, expanding_window +from factorlab.learning.time_series_analysis import rolling_window, expanding_window class FactorModel: diff --git a/src/factorlab/analytics/metrics.py b/src/factorlab/analytics/metrics.py index c1d0b66..f13b68e 100644 --- a/src/factorlab/analytics/metrics.py +++ b/src/factorlab/analytics/metrics.py @@ -3,7 +3,7 @@ import inspect from typing import Union, Optional, List -from factorlab.forecasting.time_series_analysis import TimeSeriesAnalysis as TSA +from factorlab.learning.time_series_analysis import TimeSeriesAnalysis as TSA class Metrics: diff --git a/src/factorlab/backtesting/engine.py b/src/factorlab/backtesting/engine.py index d067d84..a898c7b 100644 --- a/src/factorlab/backtesting/engine.py +++ b/src/factorlab/backtesting/engine.py @@ -2,7 +2,8 @@ import numpy as np from typing import Dict -from factorlab.strategy.strategy import Strategy +from factorlab.strategy.strategy import StrategySpec +from factorlab.core.walk_forward_runner import WalkForwardRunner from factorlab.execution.rebalancer import Rebalancer @@ -15,7 +16,7 @@ class BacktestEngine: """ def __init__(self, - config: StrategyConfig, + config: StrategySpec, data: pd.DataFrame, initial_capital: float = 1_000_000.0, verbose: bool = True, @@ -54,6 +55,7 @@ def __init__(self, self.gross_exposure = None self.portfolio_return = None self.results: Dict[str, pd.DataFrame] = {} + self.walk_forward_runner = None # Frequency map for runtime checks self.freq_map = {'monday': 0, 'tuesday': 1, 'wednesday': 2, 'thursday': 3, 'friday': 4} @@ -70,8 +72,27 @@ def 
_compute_pipeline(self): """ print("--- Computing data pipeline... ---") - # run the full data pipeline - self.pipeline = self.config.data_pipeline.fit_transform(self.data) + learning_spec = getattr(self.config, "learning_spec", None) + if learning_spec is None: + # run the full data pipeline + self.pipeline = self.config.data_pipeline.fit_transform(self.data) + else: + y = learning_spec.y + if isinstance(y, str): + if y not in self.data.columns: + raise ValueError(f"learning_spec.y='{y}' is not a column in input data.") + y = self.data[y] + + self.walk_forward_runner = WalkForwardRunner( + pipeline=self.config.data_pipeline, + splitter=learning_spec.splitter, + y=y, + target_spec=learning_spec.target_spec, + strict_temporal=learning_spec.strict_temporal, + date_level=learning_spec.date_level, + show_progress=learning_spec.show_progress, + ) + self.pipeline = self.walk_forward_runner.run(self.data) print("--- Data pipeline complete. ---\n") diff --git a/src/factorlab/core/walk_forward_runner.py b/src/factorlab/core/walk_forward_runner.py new file mode 100644 index 0000000..f7cdf02 --- /dev/null +++ b/src/factorlab/core/walk_forward_runner.py @@ -0,0 +1,189 @@ +from __future__ import annotations + +from copy import deepcopy +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +import pandas as pd +from tqdm import tqdm + +from factorlab.core.pipeline import Pipeline +from factorlab.learning.splitters import BasePanelSplit +from factorlab.learning.utils import extract_dates +from factorlab.targets.forward import ForwardTargetSpec + + +@dataclass +class FoldRunInfo: + fold: int + train_start: pd.Timestamp + train_end: pd.Timestamp + test_start: pd.Timestamp + test_end: pd.Timestamp + n_train: int + n_test: int + lookahead: int + embargo: int + horizon: int + n_trainable: Optional[int] = None + + +class WalkForwardRunner: + """ + Execute a Pipeline with explicit train/test fold boundaries. 
+ + This runner is strategy-level orchestration: each fold is fitted on train rows and + transformed up to the fold test end date, then only test rows are collected. + """ + + def __init__( + self, + pipeline: Pipeline, + splitter: BasePanelSplit, + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + target_spec: Optional[ForwardTargetSpec] = None, + strict_temporal: bool = True, + date_level: int = 0, + show_progress: bool = False, + ): + if y is not None and target_spec is not None: + raise ValueError("Provide either y or target_spec, not both.") + + self.pipeline = pipeline + self.splitter = splitter + self.y = y + self.target_spec = target_spec + self.strict_temporal = strict_temporal + self.date_level = date_level + self.show_progress = show_progress + + self.fold_info: list[FoldRunInfo] = [] + self.output: Optional[pd.DataFrame] = None + + @staticmethod + def _clone_pipeline_steps(pipeline: Pipeline) -> list[tuple[str, object]]: + return [(name, deepcopy(transformer)) for name, transformer in pipeline.steps] + + @staticmethod + def _count_valid_targets(y: Union[pd.Series, pd.DataFrame]) -> int: + if isinstance(y, pd.DataFrame): + return int(y.notna().all(axis=1).sum()) + return int(y.notna().sum()) + + def _resolve_fold_target( + self, + X_context: pd.DataFrame, + train_index: pd.Index, + ) -> Optional[Union[pd.Series, pd.DataFrame]]: + if self.target_spec is not None: + y_context = self.target_spec.build(X_context) + trainable = self.target_spec.trainable_mask( + index=X_context.index, + train_index=train_index, + date_level=self.date_level, + ) + return y_context.where(trainable) + + if self.y is None: + return None + + y_context = self.y.reindex(X_context.index) + if isinstance(y_context, pd.DataFrame): + if y_context.shape[1] != 1: + raise ValueError("WalkForwardRunner currently supports a single y target column.") + return y_context + + def _fit_transform_fold( + self, + X: pd.DataFrame, + train_index: pd.Index, + test_index: pd.Index, + ) -> 
tuple[pd.DataFrame, Optional[int]]: + dates = extract_dates(X.index, date_level=self.date_level) + test_dates = pd.DatetimeIndex(test_index.get_level_values(self.date_level) if isinstance(test_index, pd.MultiIndex) else test_index) + test_end = pd.Timestamp(test_dates.max()) + + context_mask = dates <= test_end + Xt = X.loc[context_mask].copy(deep=True) + y_context = self._resolve_fold_target(X_context=Xt, train_index=train_index) + n_trainable = None + + steps = self._clone_pipeline_steps(self.pipeline) + for _, transformer in steps: + train_mask = Xt.index.isin(train_index) + X_train = Xt.loc[train_mask] + + if y_context is None: + transformer.fit(X_train) + else: + y_train = y_context.reindex(X_train.index) + n_trainable = self._count_valid_targets(y_train) + transformer.fit(X_train, y_train) + + Xt = transformer.transform(Xt) + if y_context is not None and not y_context.index.equals(Xt.index): + y_context = y_context.reindex(Xt.index) + + return Xt.loc[test_index], n_trainable + + def run(self, X: pd.DataFrame) -> pd.DataFrame: + if not isinstance(X, pd.DataFrame): + raise TypeError("WalkForwardRunner expects X as a pandas DataFrame.") + if self.y is not None and not isinstance(self.y, (pd.Series, pd.DataFrame)): + raise TypeError("y must be a pandas Series or DataFrame when provided.") + if self.y is not None and not X.index.equals(self.y.index): + raise ValueError("WalkForwardRunner requires y to share the exact same index as X.") + if self.strict_temporal and self.target_spec is not None: + if int(getattr(self.splitter, "lookahead", 0)) < int(self.target_spec.horizon): + raise ValueError( + "Splitter lookahead is shorter than target_spec.horizon. " + "Increase lookahead or reduce horizon to avoid temporal leakage." 
+ ) + + out = X.copy(deep=True) + self.fold_info = [] + + split_y = None if self.target_spec is not None else self.y + splits = list(self.splitter.split(X, split_y)) + iterator = tqdm(enumerate(splits), total=len(splits), desc="WalkForward Runner", disable=not self.show_progress) + + for fold_id, (train_idx, test_idx) in iterator: + if len(train_idx) == 0 or len(test_idx) == 0: + continue + + train_index = X.iloc[train_idx].index + test_index = X.iloc[test_idx].index + + train_dates = extract_dates(train_index, date_level=self.date_level) + test_dates = extract_dates(test_index, date_level=self.date_level) + if pd.Timestamp(train_dates.max()) >= pd.Timestamp(test_dates.min()): + raise ValueError( + "Temporal integrity violation: train_end must be strictly earlier than test_start." + ) + + fold_out, n_trainable = self._fit_transform_fold(X=X, train_index=train_index, test_index=test_index) + + for col in fold_out.columns: + if col not in out.columns: + out[col] = np.nan + out.loc[test_index, fold_out.columns] = fold_out + + self.fold_info.append( + FoldRunInfo( + fold=fold_id, + train_start=pd.Timestamp(train_dates.min()), + train_end=pd.Timestamp(train_dates.max()), + test_start=pd.Timestamp(test_dates.min()), + test_end=pd.Timestamp(test_dates.max()), + n_train=int(len(train_idx)), + n_test=int(len(test_idx)), + lookahead=int(getattr(self.splitter, "lookahead", 0)), + embargo=int(getattr(self.splitter, "embargo", 0)), + horizon=int(self.target_spec.horizon if self.target_spec is not None else 0), + n_trainable=n_trainable, + ) + ) + + self.output = out + return out diff --git a/src/factorlab/learning/__init__.py b/src/factorlab/learning/__init__.py index e69de29..edf459e 100644 --- a/src/factorlab/learning/__init__.py +++ b/src/factorlab/learning/__init__.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from typing import Any + +from factorlab.learning.supervised.learners import ( + CatBoostClassifierLearner, + CatBoostRegressorLearner, + 
ClassificationLearner, + ElasticNetLearner, + LassoLearner, + LinearRegressionLearner, + LogisticRegressionCVLearner, + LogisticRegressionLearner, + MLPClassifierLearner, + MLPRegressorLearner, + RegressionLearner, + RandomForestClassifierLearner, + RandomForestRegressorLearner, + RidgeLearner, + SupervisedPCALearner, + TorchRegressorLearner, + XGBoostClassifierLearner, + XGBoostRegressorLearner, +) +from factorlab.learning.supervised.walk_forward_learner import WalkForwardLearner +from factorlab.learning.selectors import FeatureSelector +from factorlab.learning.splitters import ( + BasePanelSplit, + ExpandingFrequencyPanelSplit, + ExpandingIncrementPanelSplit, + ExpandingKFoldPanelSplit, + RollingKFoldPanelSplit, +) +from factorlab.learning.time_series_analysis import ( + LagFeatures, + StatsmodelsOLSLearner, + TimeSeriesAnalysis, + TimeSeriesDiagnostics, + add_lags, + expanding_window, + rolling_window, +) +from factorlab.learning.unsupervised.unsupervised_learning import ( + PCATransform, + PPCATransform, + R2PCATransform, +) + +__all__ = [ + "BasePanelSplit", + "ExpandingKFoldPanelSplit", + "RollingKFoldPanelSplit", + "ExpandingIncrementPanelSplit", + "ExpandingFrequencyPanelSplit", + "WalkForwardLearner", + "FeatureSelector", + "WalkForwardGridSearch", + "parameter_grid", + "set_pipeline_params", + "RegressionLearner", + "ClassificationLearner", + "LinearRegressionLearner", + "RidgeLearner", + "LassoLearner", + "ElasticNetLearner", + "RandomForestRegressorLearner", + "CatBoostRegressorLearner", + "RandomForestClassifierLearner", + "LogisticRegressionLearner", + "LogisticRegressionCVLearner", + "XGBoostClassifierLearner", + "CatBoostClassifierLearner", + "XGBoostRegressorLearner", + "MLPRegressorLearner", + "MLPClassifierLearner", + "TorchRegressorLearner", + "SupervisedPCALearner", + "add_lags", + "rolling_window", + "expanding_window", + "LagFeatures", + "StatsmodelsOLSLearner", + "TimeSeriesAnalysis", + "TimeSeriesDiagnostics", + "PCATransform", + 
"PPCATransform", + "R2PCATransform", +] + + +def __getattr__(name: str) -> Any: + """ + Lazily expose search utilities to avoid import cycles with targets.forward. + """ + if name in {"WalkForwardGridSearch", "parameter_grid", "set_pipeline_params"}: + from factorlab.learning.search import ( + WalkForwardGridSearch, + parameter_grid, + set_pipeline_params, + ) + + lazy_exports = { + "WalkForwardGridSearch": WalkForwardGridSearch, + "parameter_grid": parameter_grid, + "set_pipeline_params": set_pipeline_params, + } + value = lazy_exports[name] + globals()[name] = value + return value + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") diff --git a/src/factorlab/learning/base.py b/src/factorlab/learning/base.py index e69de29..8c5e471 100644 --- a/src/factorlab/learning/base.py +++ b/src/factorlab/learning/base.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Optional, Union + +import pandas as pd + +from factorlab.core.base_transform import BaseTransform + + +class SupervisedLearner(BaseTransform, ABC): + """ + Base protocol for supervised learner steps used inside the Pipeline. 
+ """ + + @abstractmethod + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "SupervisedLearner": + raise NotImplementedError + + @abstractmethod + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + raise NotImplementedError diff --git a/src/factorlab/learning/model_registry.py b/src/factorlab/learning/model_registry.py new file mode 100644 index 0000000..c6893dc --- /dev/null +++ b/src/factorlab/learning/model_registry.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +from copy import deepcopy +from typing import Any, Optional + +import numpy as np +import pandas as pd +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import ( + ElasticNet, + ElasticNetCV, + Lars, + LarsCV, + Lasso, + LassoCV, + LassoLars, + LassoLarsCV, + LassoLarsIC, + LinearRegression, + LogisticRegression, + LogisticRegressionCV, + Ridge, + RidgeCV, +) + + +def _safe_clone(model: Any) -> Any: + try: + return clone(model) + except Exception: + return deepcopy(model) + + +def build_regressor( + method: str, + n_features: Optional[int] = None, + random_state: int = 42, + custom_model: Optional[Any] = None, + method_kwargs: Optional[dict[str, Any]] = None, +) -> Any: + """ + Build a regressor by canonical method string. 
+ """ + kwargs = dict(method_kwargs or {}) + method = method.lower() + + if custom_model is not None: + return _safe_clone(custom_model) + if method in {"ols", "linear_regression"}: + return LinearRegression(**kwargs) + if method == "lasso": + return Lasso(**kwargs) + if method == "lasso_cv": + return LassoCV(**kwargs) + if method == "lasso_lars": + return LassoLars(**kwargs) + if method == "lasso_lars_cv": + return LassoLarsCV(**kwargs) + if method == "lasso_lars_ic": + return LassoLarsIC(**kwargs) + if method == "lars": + if n_features is not None and "n_nonzero_coefs" not in kwargs: + kwargs["n_nonzero_coefs"] = int(n_features) + return Lars(**kwargs) + if method == "lars_cv": + return LarsCV(**kwargs) + if method == "ridge": + return Ridge(**kwargs) + if method == "ridge_cv": + return RidgeCV(**kwargs) + if method == "elastic_net": + return ElasticNet(**kwargs) + if method == "elastic_net_cv": + return ElasticNetCV(**kwargs) + if method == "random_forest": + kwargs.setdefault("random_state", random_state) + return RandomForestRegressor(**kwargs) + if method == "xgboost": + try: + from xgboost import XGBRegressor + except Exception as exc: + raise ImportError("xgboost is required for method='xgboost'.") from exc + kwargs.setdefault("random_state", random_state) + return XGBRegressor(**kwargs) + if method == "catboost": + try: + from catboost import CatBoostRegressor + except Exception as exc: + raise ImportError("catboost is required for method='catboost'.") from exc + kwargs.setdefault("random_seed", random_state) + kwargs.setdefault("verbose", False) + return CatBoostRegressor(**kwargs) + + raise ValueError(f"Unknown regression method '{method}'.") + + +def build_classifier( + method: str, + random_state: int = 42, + custom_model: Optional[Any] = None, + method_kwargs: Optional[dict[str, Any]] = None, +) -> Any: + """ + Build a classifier by canonical method string. 
+ """ + kwargs = dict(method_kwargs or {}) + method = method.lower() + + if custom_model is not None: + return _safe_clone(custom_model) + if method in {"logit", "logistic", "logistic_regression"}: + return LogisticRegression(**kwargs) + if method in {"logit_cv", "logistic_cv", "logistic_regression_cv"}: + return LogisticRegressionCV(**kwargs) + if method in {"random_forest", "random_forest_classifier"}: + kwargs.setdefault("random_state", random_state) + return RandomForestClassifier(**kwargs) + if method in {"xgboost", "xgboost_classifier"}: + try: + from xgboost import XGBClassifier + except Exception as exc: + raise ImportError("xgboost is required for method='xgboost_classifier'.") from exc + kwargs.setdefault("random_state", random_state) + return XGBClassifier(**kwargs) + if method in {"catboost", "catboost_classifier"}: + try: + from catboost import CatBoostClassifier + except Exception as exc: + raise ImportError("catboost is required for method='catboost_classifier'.") from exc + kwargs.setdefault("random_seed", random_state) + kwargs.setdefault("verbose", False) + return CatBoostClassifier(**kwargs) + + raise ValueError(f"Unknown classification method '{method}'.") + + +def extract_feature_importance(model: Any, feature_names: list[str]) -> pd.DataFrame: + """ + Extract feature importance/coefficient ranking as DataFrame. + """ + if hasattr(model, "feature_importances_"): + vals = np.asarray(model.feature_importances_, dtype=float) + elif hasattr(model, "coef_"): + coef = np.asarray(model.coef_, dtype=float) + if coef.ndim == 1: + vals = coef + else: + vals = np.abs(coef).mean(axis=0) + else: + raise ValueError( + "Model must expose 'feature_importances_' or 'coef_' for importance extraction." 
+ ) + + score = pd.Series(np.abs(vals), index=feature_names, dtype=float).sort_values(ascending=False) + return score.to_frame(name="feature_importance") diff --git a/src/factorlab/learning/search.py b/src/factorlab/learning/search.py new file mode 100644 index 0000000..6a6b522 --- /dev/null +++ b/src/factorlab/learning/search.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from copy import deepcopy +from itertools import product +from typing import Any, Callable, Mapping, Optional, Sequence, Union + +import pandas as pd + +from factorlab.core.pipeline import Pipeline +from factorlab.core.walk_forward_runner import WalkForwardRunner +from factorlab.learning.splitters import BasePanelSplit +from factorlab.targets.forward import ForwardTargetSpec + + +def parameter_grid(parameters: Mapping[str, Sequence[Any]]) -> list[dict[str, Any]]: + """ + Expand a parameter grid into a list of parameter combinations. + + Parameters are specified using sklearn-style keys: + ``__``. + """ + if not parameters: + return [{}] + keys = list(parameters.keys()) + values = [list(parameters[k]) for k in keys] + return [dict(zip(keys, combo)) for combo in product(*values)] + + +def set_pipeline_params(pipeline: Pipeline, params: Mapping[str, Any]) -> None: + """ + Apply sklearn-style step parameters in-place to a Pipeline copy. + """ + steps = dict(pipeline.steps) + for key, value in params.items(): + if "__" not in key: + raise ValueError(f"Invalid param key '{key}'. Use '__'.") + step_name, attr = key.split("__", 1) + if step_name not in steps: + raise ValueError(f"Unknown pipeline step '{step_name}' in param key '{key}'.") + setattr(steps[step_name], attr, value) + + +class WalkForwardGridSearch: + """ + Pipeline-native parameter search using walk-forward evaluation. 
+ """ + + def __init__( + self, + pipeline: Pipeline, + splitter: BasePanelSplit, + param_grid: Union[Mapping[str, Sequence[Any]], Sequence[Mapping[str, Any]]], + scorer: Callable[[pd.DataFrame], float], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + target_spec: Optional[ForwardTargetSpec] = None, + strict_temporal: bool = True, + date_level: int = 0, + show_progress: bool = False, + ): + self.pipeline = pipeline + self.splitter = splitter + self.param_grid = ( + parameter_grid(param_grid) + if isinstance(param_grid, Mapping) + else [dict(p) for p in param_grid] + ) + self.scorer = scorer + self.y = y + self.target_spec = target_spec + self.strict_temporal = strict_temporal + self.date_level = date_level + self.show_progress = show_progress + self.results_: Optional[pd.DataFrame] = None + + def run(self, X: pd.DataFrame) -> pd.DataFrame: + if not isinstance(X, pd.DataFrame): + raise TypeError("WalkForwardGridSearch expects X as a pandas DataFrame.") + + records: list[dict[str, Any]] = [] + for params in self.param_grid: + pipe = deepcopy(self.pipeline) + set_pipeline_params(pipe, params) + runner = WalkForwardRunner( + pipeline=pipe, + splitter=self.splitter, + y=self.y, + target_spec=self.target_spec, + strict_temporal=self.strict_temporal, + date_level=self.date_level, + show_progress=self.show_progress, + ) + out = runner.run(X) + score = float(self.scorer(out)) + record = dict(params) + record["score"] = score + record["n_folds"] = len(runner.fold_info) + records.append(record) + + results = pd.DataFrame(records).sort_values("score", ascending=False).reset_index(drop=True) + self.results_ = results + return results diff --git a/src/factorlab/learning/selectors.py b/src/factorlab/learning/selectors.py new file mode 100644 index 0000000..17237ff --- /dev/null +++ b/src/factorlab/learning/selectors.py @@ -0,0 +1,407 @@ +from __future__ import annotations + +from copy import deepcopy +from itertools import combinations +from typing import Any, List, 
Optional, Sequence, Union + +import numpy as np +import pandas as pd +from scipy.stats import chi2_contingency, contingency +from sklearn.base import clone +from sklearn.feature_selection import mutual_info_classif, mutual_info_regression +from sklearn.feature_selection import RFE +from sklearn.feature_selection import SequentialFeatureSelector +from sklearn.linear_model import LinearRegression + +from factorlab.core.base_transform import BaseTransform +from factorlab.learning.model_registry import build_regressor, extract_feature_importance +from factorlab.learning.utils import resolve_feature_columns +from factorlab.core.utils.utils import to_dataframe + + +class FeatureSelector(BaseTransform): + """ + Pipeline-native feature selection transform. + + Fit learns ranked/selected features from a training fold; transform can optionally + drop unselected features from the output DataFrame. + """ + + _METHOD_ALIASES = { + "spearman_rank": "spearman", + "kendall_tau": "kendall", + "mutual_info": "mutual_info_regression", + } + _METHODS = { + "spearman", + "kendall", + "cramer_v", + "tschuprow", + "pearson_cc", + "chi2", + "mutual_info_regression", + "mutual_info_classification", + "variance", + "model_importance", + "lars", + "lasso", + "ridge", + "elastic_net", + "random_forest", + "xgboost", + "catboost", + "mrmr", + "mifs", + "mrmr_mifs", + "spearman_mrmr", + "rfe", + "stepwise", + "forward", + "backward", + "exhaustive", + } + + def __init__( + self, + method: str = "spearman", + feature_cols: Optional[Sequence[str]] = None, + target_col: Optional[str] = None, + n_features: Optional[int] = None, + score_threshold: Optional[float] = None, + model: Optional[Any] = None, + drop_unselected: bool = True, + exclude_cols: Optional[Sequence[str]] = None, + absolute_scores: bool = True, + feature_bins: int = 5, + target_bins: int = 3, + mifs_beta: float = 0.5, + random_state: int = 42, + method_kwargs: Optional[dict[str, Any]] = None, + ): + canonical_method = 
self._METHOD_ALIASES.get(method, method) + super().__init__( + name=f"FeatureSelector({canonical_method})", + description="Select top features using fold-local training data.", + ) + if canonical_method not in self._METHODS: + raise ValueError(f"method must be one of {sorted(self._METHODS)}.") + if n_features is not None and n_features < 1: + raise ValueError("n_features must be >= 1 when provided.") + if feature_bins < 2: + raise ValueError("feature_bins must be >= 2.") + if target_bins < 2: + raise ValueError("target_bins must be >= 2.") + if canonical_method == "model_importance" and model is None: + raise ValueError("model must be provided when method='model_importance'.") + + self.method = canonical_method + self.feature_cols = list(feature_cols) if feature_cols is not None else None + self.target_col = target_col + self.n_features = n_features + self.score_threshold = score_threshold + self.model = model + self.drop_unselected = drop_unselected + self.exclude_cols = list(exclude_cols) if exclude_cols is not None else [] + self.absolute_scores = absolute_scores + self.feature_bins = feature_bins + self.target_bins = target_bins + self.mifs_beta = float(mifs_beta) + self.random_state = int(random_state) + self.method_kwargs = dict(method_kwargs or {}) + + self.resolved_feature_cols_: list[str] = [] + self.selected_features_: list[str] = [] + self.feature_scores_: pd.DataFrame = pd.DataFrame() + + @property + def inputs(self) -> List[str]: + required = list(self.feature_cols) if self.feature_cols is not None else [] + if self.target_col is not None: + required.append(self.target_col) + return required + + @staticmethod + def _safe_clone(model: Any) -> Any: + try: + return clone(model) + except Exception: + return deepcopy(model) + + def _resolve_target( + self, + X_df: pd.DataFrame, + y: Optional[Union[pd.Series, pd.DataFrame]], + ) -> pd.Series: + if y is not None: + y_df = to_dataframe(y).copy(deep=True) + if y_df.shape[1] != 1: + raise ValueError("y must 
contain exactly one target column.") + y_series = y_df.iloc[:, 0].reindex(X_df.index) + return y_series + + if self.target_col is None: + raise ValueError("target_col must be provided when y is not supplied.") + if self.target_col not in X_df.columns: + raise ValueError(f"target_col '{self.target_col}' not found in input columns.") + return X_df[self.target_col] + + @staticmethod + def _quantize_series(series: pd.Series, bins: int) -> pd.Series: + valid = series.dropna() + if valid.nunique() <= 1: + return pd.Series(np.nan, index=series.index) + q = min(int(bins), int(valid.nunique())) + ranked = valid.rank(method="first") + quant = pd.qcut(ranked, q=q, labels=False, duplicates="drop") + out = pd.Series(np.nan, index=series.index, dtype=float) + out.loc[valid.index] = quant.astype(float) + return out + + def _categorical_association_scores(self, X_feat: pd.DataFrame, y: pd.Series) -> pd.Series: + yq = self._quantize_series(y, bins=self.target_bins) + scores: dict[str, float] = {} + for col in X_feat.columns: + xq = self._quantize_series(X_feat[col], bins=self.feature_bins) + valid = xq.notna() & yq.notna() + if int(valid.sum()) == 0: + scores[col] = np.nan + continue + table = pd.crosstab(xq.loc[valid], yq.loc[valid]) + if table.empty: + scores[col] = np.nan + continue + if self.method == "cramer_v": + scores[col] = float(contingency.association(table, method="cramer")) + elif self.method == "tschuprow": + scores[col] = float(contingency.association(table, method="tschuprow")) + elif self.method == "pearson_cc": + scores[col] = float(contingency.association(table, method="pearson")) + elif self.method == "chi2": + scores[col] = float(chi2_contingency(table)[0]) + else: + raise ValueError(f"Unsupported association method '{self.method}'.") + return pd.Series(scores, dtype=float) + + def _build_model_for_method(self, X_feat: pd.DataFrame) -> Any: + if self.method in { + "model_importance", + "lars", + "lasso", + "ridge", + "elastic_net", + "random_forest", + 
"xgboost", + "catboost", + }: + custom_model = self.model if self.method == "model_importance" else None + method = "linear_regression" if self.method == "model_importance" else self.method + return build_regressor( + method=method, + n_features=int(self.n_features or X_feat.shape[1]), + random_state=self.random_state, + custom_model=custom_model, + method_kwargs=self.method_kwargs, + ) + return None + + @staticmethod + def _safe_abs_spearman_corr(a: pd.Series, b: pd.Series) -> float: + corr = a.corr(b, method="spearman") + if pd.isna(corr): + return 0.0 + return float(abs(corr)) + + def _greedy_redundancy_selection( + self, + X_feat: pd.DataFrame, + y: pd.Series, + mode: str, + ) -> pd.Series: + if self.n_features is None: + top_k = min(10, X_feat.shape[1]) + else: + top_k = min(int(self.n_features), int(X_feat.shape[1])) + if top_k < 1: + raise ValueError("n_features must resolve to >= 1 for redundancy-aware methods.") + + if mode == "spearman_mrmr": + relevance = X_feat.apply(lambda col: self._safe_abs_spearman_corr(col, y)) + redundancy = X_feat.corr(method="spearman").abs().fillna(0.0) + else: + valid = X_feat.notna().all(axis=1) & y.notna() + if int(valid.sum()) == 0: + raise ValueError("No valid non-NaN rows available for relevance scoring.") + Xv = X_feat.loc[valid] + yv = y.loc[valid] + mi = mutual_info_regression(Xv, yv, random_state=self.random_state) + relevance = pd.Series(mi, index=X_feat.columns).fillna(0.0) + redundancy = X_feat.corr(method="pearson").abs().fillna(0.0) + + remaining = list(X_feat.columns) + selected: list[str] = [] + scores: dict[str, float] = {} + + while remaining and len(selected) < top_k: + if not selected: + best = max(remaining, key=lambda c: float(relevance.get(c, 0.0))) + score = float(relevance.get(best, 0.0)) + else: + candidate_scores: dict[str, float] = {} + for col in remaining: + rel = float(relevance.get(col, 0.0)) + red_vals = [float(redundancy.loc[col, s]) for s in selected if s in redundancy.columns] + red_mean = 
float(np.mean(red_vals)) if red_vals else 0.0 + red_sum = float(np.sum(red_vals)) if red_vals else 0.0 + + if mode in {"mrmr", "spearman_mrmr"}: + candidate_scores[col] = rel - red_mean + elif mode == "mifs": + candidate_scores[col] = rel - self.mifs_beta * red_sum + elif mode == "mrmr_mifs": + candidate_scores[col] = rel - 0.5 * (red_mean + self.mifs_beta * red_sum) + else: + raise ValueError(f"Unknown redundancy mode '{mode}'.") + best = max(candidate_scores, key=candidate_scores.get) + score = float(candidate_scores[best]) + + selected.append(best) + remaining.remove(best) + scores[best] = score + + return pd.Series(scores, dtype=float).sort_values(ascending=False) + + def _score_features( + self, + X_feat: pd.DataFrame, + y: Optional[pd.Series], + ) -> pd.Series: + if self.method == "variance": + scores = X_feat.var(ddof=0) + return scores.fillna(0.0) + + if y is None: + raise ValueError(f"Method '{self.method}' requires a target series.") + + if self.method in {"spearman", "kendall"}: + corr_method = "spearman" if self.method == "spearman" else "kendall" + scores = X_feat.apply(lambda col: col.corr(y, method=corr_method)) + return scores.fillna(0.0) + + if self.method in {"cramer_v", "tschuprow", "pearson_cc", "chi2"}: + return self._categorical_association_scores(X_feat=X_feat, y=y).fillna(0.0) + + if self.method in {"mrmr", "mifs", "mrmr_mifs", "spearman_mrmr"}: + return self._greedy_redundancy_selection(X_feat=X_feat, y=y, mode=self.method).fillna(0.0) + + valid = X_feat.notna().all(axis=1) & y.notna() + if int(valid.sum()) == 0: + raise ValueError("No valid non-NaN rows available for feature scoring.") + + Xv = X_feat.loc[valid] + yv = y.loc[valid] + + if self.method == "mutual_info_regression": + vals = mutual_info_regression(Xv, yv, random_state=self.random_state) + return pd.Series(vals, index=X_feat.columns, dtype=float) + + if self.method == "mutual_info_classification": + y_codes = pd.Series(yv, index=Xv.index).astype("category").cat.codes.to_numpy() 
+ vals = mutual_info_classif(Xv, y_codes, random_state=self.random_state) + return pd.Series(vals, index=X_feat.columns, dtype=float) + + if self.method == "rfe": + estimator = self._safe_clone(self.model if self.model is not None else LinearRegression()) + n_select = self.n_features if self.n_features is not None else max(1, X_feat.shape[1] // 2) + step = self.method_kwargs.get("step", 1) + selector = RFE(estimator=estimator, n_features_to_select=int(n_select), step=step) + selector.fit(Xv, yv) + ranking = pd.Series(selector.ranking_, index=X_feat.columns, dtype=float) + # Lower ranking is better; convert to descending score where selected features receive highest values. + return (ranking.max() + 1 - ranking).astype(float) + + if self.method in {"forward", "backward", "stepwise"}: + estimator = self._safe_clone(self.model if self.model is not None else LinearRegression()) + n_select = self.n_features if self.n_features is not None else max(1, X_feat.shape[1] // 2) + direction = "backward" if self.method == "backward" else "forward" + selector = SequentialFeatureSelector( + estimator=estimator, + n_features_to_select=int(n_select), + direction=direction, + ) + selector.fit(Xv, yv) + support = pd.Series(selector.get_support(), index=X_feat.columns) + return support.astype(float) + + if self.method == "exhaustive": + estimator = self._safe_clone(self.model if self.model is not None else LinearRegression()) + n_select = self.n_features if self.n_features is not None else max(1, X_feat.shape[1] // 2) + n_select = min(int(n_select), int(X_feat.shape[1])) + best_combo: Optional[tuple[str, ...]] = None + best_score = -np.inf + for combo in combinations(list(X_feat.columns), n_select): + estimator_i = self._safe_clone(estimator) + estimator_i.fit(Xv.loc[:, combo], yv) + score = float(estimator_i.score(Xv.loc[:, combo], yv)) + if score > best_score: + best_score = score + best_combo = combo + if best_combo is None: + raise ValueError("Exhaustive selection could not 
identify a valid feature subset.") + support = pd.Series(0.0, index=X_feat.columns, dtype=float) + support.loc[list(best_combo)] = 1.0 + return support + + model = self._build_model_for_method(X_feat=X_feat) + if model is None: + raise ValueError(f"Unsupported feature-selection method '{self.method}'.") + model.fit(Xv, yv) + return extract_feature_importance(model, list(X_feat.columns))["feature_importance"] + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "FeatureSelector": + df = to_dataframe(X).copy(deep=True) + self.resolved_feature_cols_ = resolve_feature_columns( + df=df, + feature_cols=self.feature_cols, + exclude_cols=self.exclude_cols + [self.target_col], + require_numeric=True, + ) + X_feat = df[self.resolved_feature_cols_] + + target = None if self.method == "variance" else self._resolve_target(df, y=y) + scores = self._score_features(X_feat, y=target) + if self.absolute_scores: + scores = scores.abs() + scores = scores.sort_values(ascending=False) + + if self.score_threshold is not None: + scores = scores.loc[scores >= float(self.score_threshold)] + if self.n_features is not None: + scores = scores.head(int(self.n_features)) + if scores.empty: + raise ValueError("Feature selection produced zero selected features.") + + self.selected_features_ = list(scores.index) + self.feature_scores_ = scores.to_frame(name="score") + self._is_fitted = True + return self + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + + df = to_dataframe(X).copy(deep=True) + missing = set(self.selected_features_) - set(df.columns) + if missing: + raise ValueError(f"Input is missing selected features required by transform: {missing}") + + if not self.drop_unselected: + return df + + drop_cols = [c for c in self.resolved_feature_cols_ if c not in self.selected_features_] + 
if drop_cols: + df = df.drop(columns=drop_cols, errors="ignore") + return df diff --git a/src/factorlab/learning/sklearn_wrapper.py b/src/factorlab/learning/sklearn_wrapper.py index 3bf6334..5409c4a 100644 --- a/src/factorlab/learning/sklearn_wrapper.py +++ b/src/factorlab/learning/sklearn_wrapper.py @@ -1,12 +1,13 @@ import pandas as pd -from typing import Union, List +from typing import Optional, Union, List from sklearn.base import BaseEstimator -from factorlab.core.base_transform import BaseTransform +from factorlab.learning.base import SupervisedLearner +from factorlab.learning.utils import resolve_feature_columns from factorlab.core.utils.utils import to_dataframe -class SKLearnWrapper(BaseTransform): +class SKLearnWrapper(SupervisedLearner): """ Wraps a scikit-learn compatible model (estimator) to integrate it as a prediction step within the FactorLab Pipeline. @@ -18,16 +19,19 @@ class SKLearnWrapper(BaseTransform): def __init__(self, model: BaseEstimator, - feature_cols: List[str], - target_col: str, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, output_col: str = 'forecast_score', - prediction_method: str = 'predict'): + prediction_method: str = 'predict', + exclude_cols: Optional[List[str]] = None): """ Args: model: An instantiated scikit-learn estimator (e.g., RandomForestClassifier()). feature_cols: List of column names to use as features (X). These must be present in the input DataFrame passed to transform(). - target_col: Name of the column to use as the target (y). + target_col: Optional name of the column to use as the target (y) when + `fit(..., y=None)` is used. If `fit(..., y=...)` is provided, + this can be omitted. output_col: Name of the new column that will hold the prediction score/signal. prediction_method: 'predict' (for classification/regression output) or 'predict_proba' (for probability scores). 
@@ -43,6 +47,8 @@ def __init__(self, self.target_col = target_col self.output_col = output_col self.prediction_method = prediction_method + self.exclude_cols = exclude_cols or [] + self._resolved_feature_cols: List[str] = [] # Input validation for the underlying model's capability if not hasattr(self.model, 'fit') or not hasattr(self.model, prediction_method): @@ -50,46 +56,78 @@ def __init__(self, @property def inputs(self) -> List[str]: - # The fit operation requires both features and the target. - # The transform operation only requires features. - return self.feature_cols + [self.target_col] - - def fit(self, X: Union[pd.Series, pd.DataFrame], y=None) -> 'SKLearnWrapper': + required = list(self.feature_cols) if self.feature_cols is not None else [] + if self.target_col is not None: + required.append(self.target_col) + return required + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> 'SKLearnWrapper': """Fits the underlying ML model using the provided training data.""" df_input = to_dataframe(X).copy(deep=True) - self.validate_inputs(df_input) - - X_train = df_input[self.feature_cols] - # Extract the target column for fitting - y_train = df_input[self.target_col] + self._resolved_feature_cols = resolve_feature_columns( + df=df_input, + feature_cols=self.feature_cols, + exclude_cols=list(self.exclude_cols) + [self.target_col, self.output_col], + require_numeric=True, + ) + X_train = df_input[self._resolved_feature_cols] + if y is None: + if self.target_col is None: + raise ValueError("target_col must be provided when fit(..., y=None).") + if self.target_col not in df_input.columns: + raise ValueError(f"target_col '{self.target_col}' not found in input columns.") + y_train = df_input[self.target_col] + else: + y_df = to_dataframe(y).copy(deep=True) + if y_df.shape[1] != 1: + raise ValueError("y must contain exactly one target column.") + y_series = y_df.iloc[:, 0] + y_train = 
y_series.reindex(X_train.index) + + valid = X_train.notna().all(axis=1) & y_train.notna() + if int(valid.sum()) == 0: + raise ValueError("No valid non-NaN rows available for model fit.") + X_train = X_train.loc[valid] + y_train = y_train.loc[valid] # Fit the model self.model.fit(X_train, y_train) self._is_fitted = True - print(f"SKLearnWrapper: Successfully fitted {self.model.__class__.__name__}.") return self def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: """Applies the underlying ML model to generate a forecast score column.""" if not self._is_fitted: raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform()") + if len(self._resolved_feature_cols) == 0: + raise RuntimeError("No resolved feature columns found. Fit must resolve feature columns first.") df = to_dataframe(X).copy(deep=True) - # We only need the features for transformation, but validate all inputs for safety - self.validate_inputs(df) + missing_features = set(self._resolved_feature_cols) - set(df.columns) + if missing_features: + raise ValueError(f"Missing feature columns for transform: {missing_features}") - X_test = df[self.feature_cols] + X_test = df[self._resolved_feature_cols] + valid = X_test.notna().all(axis=1) # Select the appropriate prediction method - if self.prediction_method == 'predict_proba' and hasattr(self.model, 'predict_proba'): - # Generally, the probability of the positive class is index 1 - scores = self.model.predict_proba(X_test)[:, 1] - elif hasattr(self.model, self.prediction_method): - # For 'predict' or other custom methods - scores = getattr(self.model, self.prediction_method)(X_test) - else: - raise AttributeError( - f"Model {self.model.__class__.__name__} does not have method '{self.prediction_method}'") + scores = pd.Series(index=df.index, dtype=float) + if valid.any(): + X_eval = X_test.loc[valid] + if self.prediction_method == 'predict_proba' and hasattr(self.model, 'predict_proba'): + # Generally, the probability 
of the positive class is index 1 + pred = self.model.predict_proba(X_eval)[:, 1] + elif hasattr(self.model, self.prediction_method): + # For 'predict' or other custom methods + pred = getattr(self.model, self.prediction_method)(X_eval) + else: + raise AttributeError( + f"Model {self.model.__class__.__name__} does not have method '{self.prediction_method}'") + scores.loc[valid] = pred # Add the resulting score column to the DataFrame df[self.output_col] = scores diff --git a/src/factorlab/learning/splitters.py b/src/factorlab/learning/splitters.py new file mode 100644 index 0000000..4cf81da --- /dev/null +++ b/src/factorlab/learning/splitters.py @@ -0,0 +1,312 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Iterator, Optional, Union + +import numpy as np +import pandas as pd +from sklearn.model_selection import BaseCrossValidator + +from factorlab.learning.utils import ( + apply_purge, + extract_dates, + mask_from_dates, + unique_sorted_dates, + validate_panel_xy, +) + + +class BasePanelSplit(BaseCrossValidator, ABC): + """ + Base panel splitter for FactorLab indices of shape (date, ticker). 
+ """ + + def __init__( + self, + date_level: int = 0, + lookahead: int = 0, + embargo: int = 0, + drop_nas: bool = True, + ): + if lookahead < 0: + raise ValueError("lookahead must be >= 0.") + if embargo < 0: + raise ValueError("embargo must be >= 0.") + + self.date_level = date_level + self.lookahead = lookahead + self.embargo = embargo + self.drop_nas = drop_nas + + def _prepare_xy( + self, + X: pd.DataFrame, + y: Optional[Union[pd.Series, pd.DataFrame]], + ) -> tuple[pd.DataFrame, np.ndarray, pd.DatetimeIndex, pd.DatetimeIndex]: + validate_panel_xy(X=X, y=y, date_level=self.date_level, require_multiindex=True) + + if y is None: + Xy = X.copy(deep=True) + else: + y_df = y.to_frame() if isinstance(y, pd.Series) else y + Xy = pd.concat([X, y_df], axis=1) + if self.drop_nas: + Xy = Xy.dropna() + else: + Xy = Xy.dropna(subset=y_df.columns) + Xy = Xy.dropna(subset=X.columns, how="all") + + row_positions = X.index.get_indexer(Xy.index) + if np.any(row_positions < 0): + raise RuntimeError("Failed to map filtered split rows back to original X index positions.") + + dates = extract_dates(Xy.index, date_level=self.date_level) + unique_dates = unique_sorted_dates(Xy.index, date_level=self.date_level) + return Xy, row_positions, dates, unique_dates + + @abstractmethod + def split( + self, + X: pd.DataFrame, + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + groups=None, + ) -> Iterator[tuple[np.ndarray, np.ndarray]]: + raise NotImplementedError + + def get_n_splits(self, X=None, y=None, groups=None): + if X is None: + raise ValueError("X is required to compute n_splits.") + return len(list(self.split(X=X, y=y, groups=groups))) + + +class ExpandingKFoldPanelSplit(BasePanelSplit): + """ + Time-respecting K-Fold splitter for panel data. 
+ """ + + def __init__( + self, + n_splits: int = 5, + **kwargs, + ): + super().__init__(**kwargs) + if n_splits < 1: + raise ValueError("n_splits must be >= 1.") + self.n_splits = n_splits + + def split(self, X, y=None, groups=None): + if groups is not None: + raise ValueError("groups is not supported by this splitter.") + _, row_positions, dates, unique_dates = self._prepare_xy(X=X, y=y) + + intervals = np.array_split(unique_dates, self.n_splits + 1) + for i in range(self.n_splits): + train_dates = np.concatenate(intervals[: i + 1]) + test_dates = intervals[i + 1] + if len(test_dates) == 0: + continue + train_dates = apply_purge( + train_dates=pd.DatetimeIndex(train_dates), + unique_dates=unique_dates, + test_start=pd.Timestamp(test_dates[0]), + lookahead=self.lookahead, + embargo=self.embargo, + ) + if len(train_dates) == 0: + continue + + train_idx = row_positions[mask_from_dates(dates, train_dates)] + test_idx = row_positions[mask_from_dates(dates, pd.DatetimeIndex(test_dates))] + if len(test_idx) == 0: + continue + yield train_idx, test_idx + + +class RollingKFoldPanelSplit(BasePanelSplit): + """ + K-Fold splitter over time intervals. 
+ """ + + def __init__(self, n_splits: int = 5, **kwargs): + super().__init__(**kwargs) + if n_splits < 2: + raise ValueError("n_splits must be >= 2.") + self.n_splits = n_splits + + def split(self, X, y=None, groups=None): + if groups is not None: + raise ValueError("groups is not supported by this splitter.") + _, row_positions, dates, unique_dates = self._prepare_xy(X=X, y=y) + intervals = np.array_split(unique_dates, self.n_splits) + + for i in range(self.n_splits): + test_dates = pd.DatetimeIndex(intervals[i]) + if len(test_dates) == 0: + continue + + train_dates = pd.DatetimeIndex(np.concatenate(intervals[:i])) + train_dates = apply_purge( + train_dates=train_dates, + unique_dates=unique_dates, + test_start=test_dates.min(), + lookahead=self.lookahead, + embargo=self.embargo, + ) + if len(train_dates) == 0: + continue + + train_idx = row_positions[mask_from_dates(dates, train_dates)] + test_idx = row_positions[mask_from_dates(dates, test_dates)] + if len(test_idx) == 0: + continue + yield train_idx, test_idx + + +class ExpandingIncrementPanelSplit(BasePanelSplit): + """ + Walk-forward splitter with fixed train expansion intervals and fixed test size. 
+ """ + + def __init__( + self, + train_intervals: int = 21, + test_size: int = 21, + min_train_periods: int = 252, + max_train_periods: Optional[int] = None, + **kwargs, + ): + super().__init__(**kwargs) + if train_intervals < 1: + raise ValueError("train_intervals must be >= 1.") + if test_size < 1: + raise ValueError("test_size must be >= 1.") + if min_train_periods < 1: + raise ValueError("min_train_periods must be >= 1.") + if max_train_periods is not None and max_train_periods < 1: + raise ValueError("max_train_periods must be >= 1 when provided.") + + self.train_intervals = train_intervals + self.test_size = test_size + self.min_train_periods = min_train_periods + self.max_train_periods = max_train_periods + + def split(self, X, y=None, groups=None): + if groups is not None: + raise ValueError("groups is not supported by this splitter.") + _, row_positions, dates, unique_dates = self._prepare_xy(X=X, y=y) + n_dates = len(unique_dates) + + train_end = self.min_train_periods + while train_end + self.lookahead + self.embargo < n_dates: + test_start = train_end + self.lookahead + self.embargo + test_end = min(test_start + self.test_size, n_dates) + if test_start >= test_end: + break + + if self.max_train_periods is None: + train_start = 0 + else: + train_start = max(0, train_end - self.max_train_periods) + + train_dates = unique_dates[train_start:train_end] + test_dates = unique_dates[test_start:test_end] + if len(train_dates) == 0 or len(test_dates) == 0: + break + + train_idx = row_positions[mask_from_dates(dates, train_dates)] + test_idx = row_positions[mask_from_dates(dates, test_dates)] + if len(test_idx) > 0: + yield train_idx, test_idx + + train_end += self.train_intervals + + +class ExpandingFrequencyPanelSplit(BasePanelSplit): + """ + Walk-forward splitter driven by calendar frequencies. 
+ """ + + _FREQ_MAP = { + "D": pd.DateOffset(days=1), + "W": pd.DateOffset(weeks=1), + "M": pd.DateOffset(months=1), + "Q": pd.DateOffset(months=3), + "Y": pd.DateOffset(years=1), + } + + def __init__( + self, + expansion_freq: str = "M", + test_freq: str = "M", + min_train_periods: int = 252, + max_train_periods: Optional[int] = None, + **kwargs, + ): + super().__init__(**kwargs) + if expansion_freq not in self._FREQ_MAP: + raise ValueError("expansion_freq must be one of {'D','W','M','Q','Y'}.") + if test_freq not in self._FREQ_MAP: + raise ValueError("test_freq must be one of {'D','W','M','Q','Y'}.") + if min_train_periods < 1: + raise ValueError("min_train_periods must be >= 1.") + if max_train_periods is not None and max_train_periods < 1: + raise ValueError("max_train_periods must be >= 1 when provided.") + + self.expansion_freq = expansion_freq + self.test_freq = test_freq + self.min_train_periods = min_train_periods + self.max_train_periods = max_train_periods + + def split(self, X, y=None, groups=None): + if groups is not None: + raise ValueError("groups is not supported by this splitter.") + _, row_positions, dates, unique_dates = self._prepare_xy(X=X, y=y) + + exp_offset = self._FREQ_MAP[self.expansion_freq] + test_offset = self._FREQ_MAP[self.test_freq] + + train_end = self.min_train_periods + n_dates = len(unique_dates) + while train_end + self.lookahead + self.embargo < n_dates: + if self.max_train_periods is None: + train_start = 0 + else: + train_start = max(0, train_end - self.max_train_periods) + + train_dates = unique_dates[train_start:train_end] + if len(train_dates) == 0: + break + + test_start_idx = train_end + self.lookahead + self.embargo + if test_start_idx >= n_dates: + break + test_start_date = unique_dates[test_start_idx] + test_end_date = test_start_date + test_offset + test_dates = unique_dates[(unique_dates >= test_start_date) & (unique_dates < test_end_date)] + if len(test_dates) == 0: + test_dates = unique_dates[test_start_idx: 
test_start_idx + 1] + if len(test_dates) == 0: + break + + train_idx = row_positions[mask_from_dates(dates, train_dates)] + test_idx = row_positions[mask_from_dates(dates, test_dates)] + if len(test_idx) > 0: + yield train_idx, test_idx + + train_last_date = unique_dates[train_end - 1] + next_cutoff_date = train_last_date + exp_offset + next_train_end = int(unique_dates.searchsorted(next_cutoff_date, side="right")) + if next_train_end <= train_end: + next_train_end = train_end + 1 + if next_train_end > n_dates: + break + train_end = next_train_end + + +__all__ = [ + "BasePanelSplit", + "ExpandingKFoldPanelSplit", + "RollingKFoldPanelSplit", + "ExpandingIncrementPanelSplit", + "ExpandingFrequencyPanelSplit", +] diff --git a/src/factorlab/learning/supervised/__init__.py b/src/factorlab/learning/supervised/__init__.py index e69de29..b38d169 100644 --- a/src/factorlab/learning/supervised/__init__.py +++ b/src/factorlab/learning/supervised/__init__.py @@ -0,0 +1,44 @@ +from factorlab.learning.supervised.learners import ( + CatBoostClassifierLearner, + CatBoostRegressorLearner, + ClassificationLearner, + ElasticNetLearner, + LassoLearner, + LinearRegressionLearner, + LogisticRegressionCVLearner, + LogisticRegressionLearner, + MLPClassifierLearner, + MLPRegressorLearner, + RandomForestClassifierLearner, + RandomForestRegressorLearner, + RegressionLearner, + RidgeLearner, + SupervisedPCALearner, + TorchRegressorLearner, + XGBoostClassifierLearner, + XGBoostRegressorLearner, +) +from factorlab.learning.supervised.walk_forward_learner import FoldInfo, WalkForwardLearner + +__all__ = [ + "WalkForwardLearner", + "FoldInfo", + "RegressionLearner", + "ClassificationLearner", + "LinearRegressionLearner", + "RidgeLearner", + "LassoLearner", + "ElasticNetLearner", + "RandomForestRegressorLearner", + "CatBoostRegressorLearner", + "RandomForestClassifierLearner", + "LogisticRegressionLearner", + "LogisticRegressionCVLearner", + "XGBoostClassifierLearner", + 
"CatBoostClassifierLearner", + "XGBoostRegressorLearner", + "MLPRegressorLearner", + "MLPClassifierLearner", + "TorchRegressorLearner", + "SupervisedPCALearner", +] diff --git a/src/factorlab/learning/supervised/learners.py b/src/factorlab/learning/supervised/learners.py new file mode 100644 index 0000000..0ea36b4 --- /dev/null +++ b/src/factorlab/learning/supervised/learners.py @@ -0,0 +1,673 @@ +from __future__ import annotations + +from copy import deepcopy +from typing import Any, List, Optional, Sequence, Union + +import numpy as np +import pandas as pd +from sklearn.base import clone +from sklearn.decomposition import PCA +from sklearn.linear_model import LinearRegression +from sklearn.neural_network import MLPClassifier, MLPRegressor + +from factorlab.learning.base import SupervisedLearner +from factorlab.learning.model_registry import build_classifier, build_regressor +from factorlab.learning.selectors import FeatureSelector +from factorlab.learning.sklearn_wrapper import SKLearnWrapper +from factorlab.learning.utils import resolve_feature_columns +from factorlab.core.utils.utils import to_dataframe + + +class RegressionLearner(SKLearnWrapper): + """ + Pipeline-native regression learner built from the shared model registry. + + Use ``method`` for full model coverage (linear/lasso/lars/ridge/elastic-net/ + random-forest/xgboost/catboost). 
+ """ + + def __init__( + self, + method: str = "linear_regression", + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + model: Optional[Any] = None, + n_features: Optional[int] = None, + random_state: int = 42, + prediction_method: str = "predict", + **model_kwargs: Any, + ): + estimator = build_regressor( + method=method, + n_features=n_features, + random_state=random_state, + custom_model=model, + method_kwargs=model_kwargs, + ) + self.method = method + self.model_kwargs = dict(model_kwargs) + + super().__init__( + model=estimator, + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + prediction_method=prediction_method, + exclude_cols=exclude_cols, + ) + + +class ClassificationLearner(SKLearnWrapper): + """ + Pipeline-native classification learner built from the shared model registry. + + Use ``method`` for full model coverage (logistic/logistic-cv/random-forest/ + xgboost/catboost). 
+ """ + + def __init__( + self, + method: str = "logistic_regression", + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast_proba", + exclude_cols: Optional[List[str]] = None, + model: Optional[Any] = None, + random_state: int = 42, + prediction_method: str = "predict_proba", + **model_kwargs: Any, + ): + estimator = build_classifier( + method=method, + random_state=random_state, + custom_model=model, + method_kwargs=model_kwargs, + ) + self.method = method + self.model_kwargs = dict(model_kwargs) + + super().__init__( + model=estimator, + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + prediction_method=prediction_method, + exclude_cols=exclude_cols, + ) + + +class LinearRegressionLearner(RegressionLearner): + def __init__( + self, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="linear_regression", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + **model_kwargs, + ) + + +class RidgeLearner(RegressionLearner): + def __init__( + self, + alpha: float = 1.0, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="ridge", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + alpha=alpha, + **model_kwargs, + ) + + +class LassoLearner(RegressionLearner): + def __init__( + self, + alpha: float = 0.05, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="lasso", + feature_cols=feature_cols, + 
target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + alpha=alpha, + **model_kwargs, + ) + + +class ElasticNetLearner(RegressionLearner): + def __init__( + self, + alpha: float = 0.05, + l1_ratio: float = 0.5, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="elastic_net", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + alpha=alpha, + l1_ratio=l1_ratio, + **model_kwargs, + ) + + +class RandomForestRegressorLearner(RegressionLearner): + def __init__( + self, + n_estimators: int = 200, + max_depth: Optional[int] = None, + random_state: int = 42, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="random_forest", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + random_state=random_state, + n_estimators=n_estimators, + max_depth=max_depth, + **model_kwargs, + ) + + +class XGBoostRegressorLearner(RegressionLearner): + def __init__( + self, + n_estimators: int = 300, + max_depth: int = 6, + random_state: int = 42, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="xgboost", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + random_state=random_state, + n_estimators=n_estimators, + max_depth=max_depth, + **model_kwargs, + ) + + +class CatBoostRegressorLearner(RegressionLearner): + def __init__( + self, + iterations: int = 300, + depth: int = 6, + learning_rate: float = 0.05, + 
random_state: int = 42, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="catboost", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + random_state=random_state, + iterations=iterations, + depth=depth, + learning_rate=learning_rate, + **model_kwargs, + ) + + +class LogisticRegressionLearner(ClassificationLearner): + def __init__( + self, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast_proba", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="logistic_regression", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + prediction_method="predict_proba", + **model_kwargs, + ) + + +class LogisticRegressionCVLearner(ClassificationLearner): + def __init__( + self, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast_proba", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + method="logistic_regression_cv", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + prediction_method="predict_proba", + **model_kwargs, + ) + + +class RandomForestClassifierLearner(ClassificationLearner): + def __init__( + self, + n_estimators: int = 200, + max_depth: Optional[int] = None, + random_state: int = 42, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast_proba", + exclude_cols: Optional[List[str]] = None, + prediction_method: str = "predict_proba", + **model_kwargs: Any, + ): + super().__init__( + method="random_forest", + feature_cols=feature_cols, + target_col=target_col, + 
output_col=output_col, + exclude_cols=exclude_cols, + random_state=random_state, + prediction_method=prediction_method, + n_estimators=n_estimators, + max_depth=max_depth, + **model_kwargs, + ) + + +class XGBoostClassifierLearner(ClassificationLearner): + def __init__( + self, + n_estimators: int = 300, + max_depth: int = 6, + random_state: int = 42, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast_proba", + exclude_cols: Optional[List[str]] = None, + prediction_method: str = "predict_proba", + **model_kwargs: Any, + ): + super().__init__( + method="xgboost", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + random_state=random_state, + prediction_method=prediction_method, + n_estimators=n_estimators, + max_depth=max_depth, + **model_kwargs, + ) + + +class CatBoostClassifierLearner(ClassificationLearner): + def __init__( + self, + iterations: int = 300, + depth: int = 6, + learning_rate: float = 0.05, + random_state: int = 42, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast_proba", + exclude_cols: Optional[List[str]] = None, + prediction_method: str = "predict_proba", + **model_kwargs: Any, + ): + super().__init__( + method="catboost", + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + exclude_cols=exclude_cols, + random_state=random_state, + prediction_method=prediction_method, + iterations=iterations, + depth=depth, + learning_rate=learning_rate, + **model_kwargs, + ) + + +class MLPRegressorLearner(SKLearnWrapper): + def __init__( + self, + hidden_layer_sizes: tuple[int, ...] 
= (64, 32), + random_state: int = 42, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + model=MLPRegressor( + hidden_layer_sizes=hidden_layer_sizes, + random_state=random_state, + **model_kwargs, + ), + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + prediction_method="predict", + exclude_cols=exclude_cols, + ) + + +class MLPClassifierLearner(SKLearnWrapper): + def __init__( + self, + hidden_layer_sizes: tuple[int, ...] = (64, 32), + random_state: int = 42, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast_proba", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + model=MLPClassifier( + hidden_layer_sizes=hidden_layer_sizes, + random_state=random_state, + **model_kwargs, + ), + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + prediction_method="predict_proba", + exclude_cols=exclude_cols, + ) + + +class TorchMLPRegressor: + """ + Optional lightweight torch estimator with sklearn-like fit/predict. 
+ """ + + def __init__( + self, + input_dim: Optional[int] = None, + hidden_dim: int = 64, + epochs: int = 50, + lr: float = 1e-3, + batch_size: int = 128, + random_state: int = 42, + ): + self.input_dim = input_dim + self.hidden_dim = hidden_dim + self.epochs = epochs + self.lr = lr + self.batch_size = batch_size + self.random_state = random_state + self._net = None + + def fit(self, X, y): + try: + import torch + except Exception as exc: + raise ImportError("torch is required for TorchMLPRegressor.") from exc + + X_np = np.asarray(X, dtype=np.float32) + y_np = np.asarray(y, dtype=np.float32).reshape(-1, 1) + in_dim = int(X_np.shape[1] if self.input_dim is None else self.input_dim) + + torch.manual_seed(self.random_state) + self._net = torch.nn.Sequential( + torch.nn.Linear(in_dim, self.hidden_dim), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_dim, 1), + ) + optimizer = torch.optim.Adam(self._net.parameters(), lr=self.lr) + loss_fn = torch.nn.MSELoss() + + dataset = torch.utils.data.TensorDataset( + torch.from_numpy(X_np), + torch.from_numpy(y_np), + ) + loader = torch.utils.data.DataLoader( + dataset, + batch_size=min(self.batch_size, len(dataset)), + shuffle=True, + ) + + self._net.train() + for _ in range(int(self.epochs)): + for xb, yb in loader: + optimizer.zero_grad() + pred = self._net(xb) + loss = loss_fn(pred, yb) + loss.backward() + optimizer.step() + return self + + def predict(self, X): + if self._net is None: + raise RuntimeError("TorchMLPRegressor must be fitted before predict().") + import torch + + X_np = np.asarray(X, dtype=np.float32) + self._net.eval() + with torch.no_grad(): + pred = self._net(torch.from_numpy(X_np)).cpu().numpy().reshape(-1) + return pred + + +class TorchRegressorLearner(SKLearnWrapper): + def __init__( + self, + feature_cols: Optional[List[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[List[str]] = None, + **model_kwargs: Any, + ): + super().__init__( + 
model=TorchMLPRegressor(**model_kwargs), + feature_cols=feature_cols, + target_col=target_col, + output_col=output_col, + prediction_method="predict", + exclude_cols=exclude_cols, + ) + + +class SupervisedPCALearner(SupervisedLearner): + """ + Pipeline-native supervised PCA learner. + + Workflow: + 1) Select a fold-local feature subset with FeatureSelector. + 2) Fit PCA on selected features. + 3) Fit regression model on principal components. + """ + + def __init__( + self, + feature_cols: Optional[Sequence[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[Sequence[str]] = None, + selection_method: str = "lasso", + n_features: int = 30, + n_components: Optional[int] = None, + selector_kwargs: Optional[dict[str, Any]] = None, + model: Optional[Any] = None, + random_state: int = 42, + ): + super().__init__( + name="SupervisedPCALearner", + description="Feature selection + PCA + supervised regression learner.", + ) + if n_features < 1: + raise ValueError("n_features must be >= 1.") + if n_components is not None and n_components < 1: + raise ValueError("n_components must be >= 1 when provided.") + + self.feature_cols = list(feature_cols) if feature_cols is not None else None + self.target_col = target_col + self.output_col = output_col + self.exclude_cols = list(exclude_cols) if exclude_cols is not None else [] + self.selection_method = selection_method + self.n_features = int(n_features) + self.n_components = n_components + self.selector_kwargs = dict(selector_kwargs or {}) + self.model = model if model is not None else LinearRegression() + self.random_state = int(random_state) + + self._resolved_feature_cols: list[str] = [] + self._selected_feature_cols: list[str] = [] + self._selector: Optional[FeatureSelector] = None + self._pca: Optional[PCA] = None + self._model = None + + @property + def inputs(self) -> List[str]: + required = list(self.feature_cols) if self.feature_cols is not None else [] + if 
self.target_col is not None: + required.append(self.target_col) + return required + + @staticmethod + def _safe_clone(model: Any) -> Any: + try: + return clone(model) + except Exception: + return deepcopy(model) + + def _resolve_target( + self, + df: pd.DataFrame, + y: Optional[Union[pd.Series, pd.DataFrame]], + ) -> pd.Series: + if y is not None: + y_df = to_dataframe(y).copy(deep=True) + if y_df.shape[1] != 1: + raise ValueError("y must contain exactly one target column.") + return y_df.iloc[:, 0].reindex(df.index) + + if self.target_col is None: + raise ValueError("target_col must be provided when fit(..., y=None).") + if self.target_col not in df.columns: + raise ValueError(f"target_col '{self.target_col}' not found in input columns.") + return df[self.target_col] + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "SupervisedPCALearner": + df = to_dataframe(X).copy(deep=True) + self._resolved_feature_cols = resolve_feature_columns( + df=df, + feature_cols=self.feature_cols, + exclude_cols=self.exclude_cols + [self.target_col, self.output_col], + require_numeric=True, + ) + target = self._resolve_target(df=df, y=y) + + selector = FeatureSelector( + method=self.selection_method, + feature_cols=self._resolved_feature_cols, + n_features=min(self.n_features, len(self._resolved_feature_cols)), + drop_unselected=True, + absolute_scores=True, + random_state=self.random_state, + method_kwargs=self.selector_kwargs, + ) + selector.fit(df, target) + self._selector = selector + self._selected_feature_cols = list(selector.selected_features_) + if len(self._selected_feature_cols) == 0: + raise ValueError("SupervisedPCALearner selected zero features.") + + X_selected = df[self._selected_feature_cols] + valid = X_selected.notna().all(axis=1) & target.notna() + if int(valid.sum()) == 0: + raise ValueError("No valid non-NaN rows available for SupervisedPCALearner fit.") + + X_train = X_selected.loc[valid] + 
y_train = target.loc[valid] + n_components = self.n_components or min(self.n_features, X_train.shape[1]) + n_components = max(1, min(int(n_components), int(X_train.shape[1]))) + + self._pca = PCA(n_components=n_components, random_state=self.random_state) + X_pcs = self._pca.fit_transform(X_train) + + self._model = self._safe_clone(self.model) + self._model.fit(X_pcs, y_train) + self._is_fitted = True + return self + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + if self._selector is None or self._pca is None or self._model is None: + raise RuntimeError("SupervisedPCALearner internal components are not fitted.") + + df = to_dataframe(X).copy(deep=True) + missing = set(self._selected_feature_cols) - set(df.columns) + if missing: + raise ValueError(f"Missing selected features for transform: {missing}") + + X_selected = df[self._selected_feature_cols] + valid = X_selected.notna().all(axis=1) + pred = pd.Series(np.nan, index=df.index, dtype=float) + if int(valid.sum()) > 0: + X_pcs = self._pca.transform(X_selected.loc[valid]) + pred.loc[valid] = np.asarray(self._model.predict(X_pcs), dtype=float).reshape(-1) + + df[self.output_col] = pred + return df diff --git a/src/factorlab/learning/supervised/walk_forward_learner.py b/src/factorlab/learning/supervised/walk_forward_learner.py new file mode 100644 index 0000000..ed777bb --- /dev/null +++ b/src/factorlab/learning/supervised/walk_forward_learner.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, List, Optional, Sequence, Union + +import numpy as np +import pandas as pd +from sklearn.base import clone +from tqdm import tqdm + +from factorlab.core.base_transform import BaseTransform +from factorlab.learning.utils import extract_dates, unique_sorted_dates +from factorlab.targets.base 
import Target +from factorlab.core.utils.utils import to_dataframe + + +@dataclass +class FoldInfo: + train_start: pd.Timestamp + train_end: pd.Timestamp + test_start: pd.Timestamp + test_end: pd.Timestamp + n_train: int + n_test: int + + +class WalkForwardLearner(BaseTransform): + """ + Supervised forecasting transform that performs date-based walk-forward fitting. + + This class is designed to be inserted directly into ``Pipeline`` after factor + construction and before signal generation. + """ + + def __init__( + self, + model: Any, + feature_cols: Sequence[str], + target_transform: Optional[Target] = None, + target_col: Optional[str] = None, + prediction_col: str = "forecast", + window_type: str = "expanding", + min_train_periods: int = 60, + train_periods: Optional[int] = None, + retrain_interval: int = 1, + min_train_samples: int = 100, + prediction_method: str = "predict", + proba_index: int = 1, + label_lookahead: Optional[int] = None, + date_level: int = 0, + show_progress: bool = False, + ): + super().__init__( + name=f"WalkForwardLearner({model.__class__.__name__})", + description="Date-based walk-forward supervised learning step.", + ) + + if not feature_cols: + raise ValueError("feature_cols cannot be empty.") + if target_transform is None and target_col is None: + raise ValueError("Provide either target_transform or target_col.") + if window_type not in {"expanding", "rolling"}: + raise ValueError("window_type must be one of {'expanding', 'rolling'}.") + if min_train_periods < 1: + raise ValueError("min_train_periods must be >= 1.") + if retrain_interval < 1: + raise ValueError("retrain_interval must be >= 1.") + if min_train_samples < 1: + raise ValueError("min_train_samples must be >= 1.") + if window_type == "rolling" and (train_periods is None or train_periods < 1): + raise ValueError("train_periods must be >= 1 when window_type='rolling'.") + + self.model = model + self.feature_cols = list(feature_cols) + self.target_transform = target_transform 
+ self.target_col = target_col + self.prediction_col = prediction_col + self.window_type = window_type + self.min_train_periods = min_train_periods + self.train_periods = train_periods + self.retrain_interval = retrain_interval + self.min_train_samples = min_train_samples + self.prediction_method = prediction_method + self.proba_index = proba_index + self.label_lookahead = label_lookahead + self.date_level = date_level + self.show_progress = show_progress + + self.fold_info: List[FoldInfo] = [] + + @property + def inputs(self) -> List[str]: + required = set(self.feature_cols) + if self.target_transform is not None: + required.update(self.target_transform.inputs) + elif self.target_col is not None: + required.add(self.target_col) + return sorted(required) + + @staticmethod + def _safe_clone(model: Any) -> Any: + try: + return clone(model) + except Exception: + return deepcopy(model) + + def _resolve_target_column(self, df: pd.DataFrame) -> tuple[pd.DataFrame, str]: + if self.target_transform is not None: + self.target_transform.fit(df) + out = self.target_transform.transform(df) + if not hasattr(self.target_transform, "output_col"): + raise ValueError("target_transform must expose an 'output_col' attribute.") + return out, self.target_transform.output_col + + if self.target_col is None: + raise ValueError("target_col is required when target_transform is not provided.") + if self.target_col not in df.columns: + raise ValueError(f"target_col '{self.target_col}' is missing from input data.") + return df, self.target_col + + def _predict(self, model: Any, X: pd.DataFrame) -> np.ndarray: + if self.prediction_method == "predict_proba": + if not hasattr(model, "predict_proba"): + raise AttributeError(f"{model.__class__.__name__} does not implement predict_proba().") + proba = np.asarray(model.predict_proba(X)) + if proba.ndim != 2: + raise ValueError("predict_proba output must be 2-dimensional.") + if self.proba_index >= proba.shape[1]: + raise ValueError( + f"proba_index 
{self.proba_index} is out of bounds for predict_proba output with " + f"{proba.shape[1]} columns." + ) + return proba[:, self.proba_index] + + if not hasattr(model, self.prediction_method): + raise AttributeError(f"{model.__class__.__name__} does not implement {self.prediction_method}().") + + pred = np.asarray(getattr(model, self.prediction_method)(X)) + if pred.ndim == 1: + return pred + if pred.ndim == 2 and pred.shape[1] == 1: + return pred.reshape(-1) + raise ValueError("Prediction output must be 1-dimensional for this forecaster.") + + def _resolve_lookahead(self) -> int: + if self.label_lookahead is not None: + if self.label_lookahead < 0: + raise ValueError("label_lookahead must be >= 0.") + return self.label_lookahead + + if self.target_transform is not None and hasattr(self.target_transform, "horizon"): + return int(getattr(self.target_transform, "horizon")) + + return 0 + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "WalkForwardLearner": + df = to_dataframe(X).copy(deep=True) + self.validate_inputs(df) + self._resolve_target_column(df) + self._is_fitted = True + return self + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + + df = to_dataframe(X).copy(deep=True) + self.validate_inputs(df) + df_target, target_col = self._resolve_target_column(df) + + dates = extract_dates(df.index, date_level=self.date_level) + unique_dates = unique_sorted_dates(df.index, date_level=self.date_level) + + lookahead = self._resolve_lookahead() + first_pred_idx = self.min_train_periods + lookahead + if first_pred_idx >= len(unique_dates): + raise ValueError( + f"Not enough dates for walk-forward forecasting: need more than " + f"{first_pred_idx} unique dates, got {len(unique_dates)}." 
+ ) + + out = df.copy(deep=True) + out[self.prediction_col] = np.nan + self.fold_info = [] + + split_starts = range(first_pred_idx, len(unique_dates), self.retrain_interval) + iterator = tqdm(split_starts, desc="WalkForward Learning", disable=not self.show_progress) + + for pred_idx in iterator: + pred_dates = unique_dates[pred_idx: pred_idx + self.retrain_interval] + if pred_dates.empty: + continue + + train_end_idx = pred_idx - lookahead + if train_end_idx <= 0: + continue + + if self.window_type == "expanding": + train_start_idx = 0 + else: + train_start_idx = max(0, train_end_idx - int(self.train_periods)) + + train_dates = unique_dates[train_start_idx:train_end_idx] + if train_dates.empty: + continue + + train_mask = dates.isin(train_dates) + test_mask = dates.isin(pred_dates) + + train_df = df_target.loc[train_mask, self.feature_cols + [target_col]] + train_valid = train_df[self.feature_cols].notna().all(axis=1) & train_df[target_col].notna() + if int(train_valid.sum()) < self.min_train_samples: + continue + + X_train = train_df.loc[train_valid, self.feature_cols] + y_train = train_df.loc[train_valid, target_col] + + model = self._safe_clone(self.model) + model.fit(X_train, y_train) + + test_df = out.loc[test_mask, self.feature_cols] + test_valid = test_df.notna().all(axis=1) + if int(test_valid.sum()) > 0: + preds = self._predict(model, test_df.loc[test_valid, self.feature_cols]) + out.loc[test_df.loc[test_valid].index, self.prediction_col] = preds + + self.fold_info.append( + FoldInfo( + train_start=train_dates.min(), + train_end=train_dates.max(), + test_start=pred_dates.min(), + test_end=pred_dates.max(), + n_train=int(train_valid.sum()), + n_test=int(test_valid.sum()), + ) + ) + + return out diff --git a/src/factorlab/learning/time_series_analysis.py b/src/factorlab/learning/time_series_analysis.py index a464b83..65e1f8f 100644 --- a/src/factorlab/learning/time_series_analysis.py +++ b/src/factorlab/learning/time_series_analysis.py @@ -1,14 +1,20 @@ 
import pandas as pd import numpy as np -from typing import Optional, Union, Any, Callable, Dict +from contextlib import redirect_stdout +from io import StringIO +from typing import Optional, Union, Any, Callable, Dict, List, Sequence import inspect import statsmodels +import statsmodels.api as sm from statsmodels.tsa.tsatools import add_trend from statsmodels.api import OLS, RecursiveLS from statsmodels.regression.rolling import RollingOLS from statsmodels.tsa.stattools import adfuller, grangercausalitytests -# from factorlab.feature_engineering.transformations import Transform +from factorlab.core.base_transform import BaseTransform +from factorlab.learning.base import SupervisedLearner +from factorlab.learning.utils import resolve_feature_columns +from factorlab.core.utils.utils import to_dataframe def rolling_window(callable_obj: Callable, @@ -190,7 +196,12 @@ def expanding_window(callable_obj: Callable, data: Union[pd.DataFrame, np.ndarra return out -def add_lags(data: Union[pd.DataFrame, pd.Series], n_lags: int) -> pd.DataFrame: +def add_lags( + data: Union[pd.DataFrame, pd.Series], + n_lags: int, + group_level: int = 1, + include_original: bool = True, +) -> pd.DataFrame: """ Adds lags to time series data. @@ -200,6 +211,10 @@ def add_lags(data: Union[pd.DataFrame, pd.Series], n_lags: int) -> pd.DataFrame: DataFrame or Series with time series data. n_lags: int Number of lags to include in the model. + group_level: int, default 1 + Group level for MultiIndex panel data. + include_original: bool, default True + Include the unlagged input columns. 
Returns ------- @@ -214,21 +229,16 @@ def add_lags(data: Union[pd.DataFrame, pd.Series], n_lags: int) -> pd.DataFrame: if isinstance(data, pd.Series): data = data.to_frame() - # create emtpy list for lagged columns - lagged_cols_list = [] - - # loop through each column - for col in data.columns: + parts = [data.copy(deep=True)] if include_original else [] + for lag in range(1, int(n_lags) + 1): if isinstance(data.index, pd.MultiIndex): - lagged_cols = [data.groupby(level=1)[col].shift(lag).rename(f"{str(col)}_L{str(lag)}") - for lag in range(1, n_lags + 1)] + lagged = data.groupby(level=group_level, observed=True).shift(lag) else: - lagged_cols = [data[col].shift(lag).rename(f"{str(col)}_L{str(lag)}") - for lag in range(1, n_lags + 1)] - lagged_cols_list.extend(lagged_cols) + lagged = data.shift(lag) + lagged.columns = [f"{col}_L{lag}" for col in data.columns] + parts.append(lagged) - # concat data and lagged values - new_df = pd.concat([data] + lagged_cols_list, axis=1) + new_df = pd.concat(parts, axis=1) return new_df @@ -302,17 +312,17 @@ def _preprocess_data(self) -> pd.DataFrame: if isinstance(self.features, pd.Series): self.features = self.features.to_frame() - # # log - # if self.log: - # self.target = Transform(self.target).log() - # if self.features is not None: - # self.features = Transform(self.features).log() - # - # # diff - # if self.diff: - # self.target = Transform(self.target).diff().dropna() - # if self.features is not None: - # self.features = Transform(self.features).diff().dropna() + # log + if self.log: + self.target = np.log(self.target.clip(lower=1e-12)) + if self.features is not None: + self.features = np.log(self.features.clip(lower=1e-12)) + + # diff + if self.diff: + self.target = self.target.diff().dropna() + if self.features is not None: + self.features = self.features.diff().dropna() # concat features and target self.data = pd.concat([self.target, self.features], axis=1).dropna() @@ -944,3 +954,241 @@ def granger_causality(self, 
self.results.loc[f"{feature}_L{i}", 'p-val'] = res[i][0][test][1] return self.results.astype(float).round(decimals=4).sort_values(by=test, ascending=False) + + +class LagFeatures(BaseTransform): + """ + Pipeline transform for creating lagged feature columns. + """ + + def __init__( + self, + input_cols: Optional[Sequence[str]] = None, + n_lags: int = 1, + include_original: bool = True, + group_level: int = 1, + ): + super().__init__( + name=f"LagFeatures(L{n_lags})", + description="Create lagged feature columns with panel-aware grouping.", + ) + if n_lags < 1: + raise ValueError("n_lags must be >= 1.") + self.input_cols = list(input_cols) if input_cols is not None else None + self.n_lags = int(n_lags) + self.include_original = bool(include_original) + self.group_level = int(group_level) + self._resolved_input_cols: list[str] = [] + + @property + def inputs(self) -> List[str]: + return list(self.input_cols or []) + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "LagFeatures": + df = to_dataframe(X).copy(deep=True) + self._resolved_input_cols = resolve_feature_columns( + df=df, + feature_cols=self.input_cols, + exclude_cols=[], + require_numeric=True, + ) + self._is_fitted = True + return self + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + df = to_dataframe(X).copy(deep=True) + missing = set(self._resolved_input_cols) - set(df.columns) + if missing: + raise ValueError(f"Missing input columns for LagFeatures transform: {missing}") + + lagged = add_lags( + data=df[self._resolved_input_cols], + n_lags=self.n_lags, + group_level=self.group_level, + include_original=self.include_original, + ) + keep_cols = [c for c in lagged.columns if c not in df.columns] + if keep_cols: + df = pd.concat([df, lagged[keep_cols]], axis=1) + return df + + +class 
StatsmodelsOLSLearner(SupervisedLearner): + """ + OLS learner using statsmodels, exposed with fit/transform pipeline interface. + """ + + def __init__( + self, + feature_cols: Optional[Sequence[str]] = None, + target_col: Optional[str] = None, + output_col: str = "forecast", + exclude_cols: Optional[Sequence[str]] = None, + add_intercept: bool = True, + ): + super().__init__( + name="StatsmodelsOLSLearner", + description="Fit statsmodels OLS and append point predictions.", + ) + self.feature_cols = list(feature_cols) if feature_cols is not None else None + self.target_col = target_col + self.output_col = output_col + self.exclude_cols = list(exclude_cols) if exclude_cols is not None else [] + self.add_intercept = bool(add_intercept) + + self._resolved_feature_cols: list[str] = [] + self._model_result = None + + @property + def inputs(self) -> List[str]: + required = list(self.feature_cols or []) + if self.target_col is not None: + required.append(self.target_col) + return required + + def _prepare_target( + self, + df: pd.DataFrame, + y: Optional[Union[pd.Series, pd.DataFrame]], + ) -> pd.Series: + if y is not None: + y_df = to_dataframe(y).copy(deep=True) + if y_df.shape[1] != 1: + raise ValueError("y must contain exactly one target column.") + return y_df.iloc[:, 0].reindex(df.index) + if self.target_col is None: + raise ValueError("target_col must be provided when fit(..., y=None).") + if self.target_col not in df.columns: + raise ValueError(f"target_col '{self.target_col}' not found in input columns.") + return df[self.target_col] + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "StatsmodelsOLSLearner": + df = to_dataframe(X).copy(deep=True) + self._resolved_feature_cols = resolve_feature_columns( + df=df, + feature_cols=self.feature_cols, + exclude_cols=self.exclude_cols + [self.target_col, self.output_col], + require_numeric=True, + ) + target = self._prepare_target(df=df, y=y) + X_fit = 
df[self._resolved_feature_cols] + valid = X_fit.notna().all(axis=1) & target.notna() + if int(valid.sum()) == 0: + raise ValueError("No valid non-NaN rows available for OLS fit.") + + Xv = X_fit.loc[valid] + yv = target.loc[valid] + if self.add_intercept: + Xv = sm.add_constant(Xv, has_constant="add") + self._model_result = sm.OLS(yv, Xv).fit() + self._is_fitted = True + return self + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + df = to_dataframe(X).copy(deep=True) + missing = set(self._resolved_feature_cols) - set(df.columns) + if missing: + raise ValueError(f"Missing feature columns for OLS transform: {missing}") + + X_pred = df[self._resolved_feature_cols] + valid = X_pred.notna().all(axis=1) + pred = pd.Series(np.nan, index=df.index, dtype=float) + if int(valid.sum()) > 0: + Xv = X_pred.loc[valid] + if self.add_intercept: + Xv = sm.add_constant(Xv, has_constant="add") + pred.loc[valid] = self._model_result.predict(Xv) + df[self.output_col] = pred + return df + + @property + def model_result(self): + return self._model_result + + +class TimeSeriesDiagnostics: + """ + Time-series diagnostics helpers for ADF and Granger causality tests. 
+ """ + + @staticmethod + def adf_test( + data: Union[pd.Series, pd.DataFrame], + regression: str = "c", + autolag: Optional[str] = "AIC", + ) -> pd.DataFrame: + df = to_dataframe(data).copy(deep=True) + rows: list[dict[str, float]] = [] + for col in df.columns: + s = df[col].dropna() + if s.empty: + rows.append( + { + "series": col, + "adf_stat": np.nan, + "p_value": np.nan, + "used_lag": np.nan, + "n_obs": np.nan, + } + ) + continue + stat = adfuller(s, regression=regression, autolag=autolag) + rows.append( + { + "series": col, + "adf_stat": float(stat[0]), + "p_value": float(stat[1]), + "used_lag": int(stat[2]), + "n_obs": int(stat[3]), + } + ) + return pd.DataFrame(rows).set_index("series") + + @staticmethod + def granger_causality( + target: Union[pd.Series, pd.DataFrame], + features: Union[pd.Series, pd.DataFrame], + max_lag: int = 4, + test: str = "ssr_ftest", + verbose: bool = False, + ) -> pd.DataFrame: + y = to_dataframe(target).copy(deep=True) + X = to_dataframe(features).copy(deep=True) + if y.shape[1] != 1: + raise ValueError("target must contain exactly one column.") + target_col = y.columns[0] + + rows: list[dict[str, float]] = [] + for feat in X.columns: + pair = pd.concat([y, X[[feat]]], axis=1).dropna() + if len(pair) <= max_lag + 1: + rows.append({"feature": feat, "min_p_value": np.nan, "best_lag": np.nan}) + continue + + if verbose: + out = grangercausalitytests(pair[[target_col, feat]], maxlag=max_lag, verbose=True) + else: + with redirect_stdout(StringIO()): + out = grangercausalitytests(pair[[target_col, feat]], maxlag=max_lag) + pvals = {lag: float(res[0][test][1]) for lag, res in out.items()} + best_lag = min(pvals, key=pvals.get) + rows.append( + { + "feature": feat, + "min_p_value": pvals[best_lag], + "best_lag": int(best_lag), + } + ) + return pd.DataFrame(rows).set_index("feature") diff --git a/src/factorlab/learning/unsupervised/__init__.py b/src/factorlab/learning/unsupervised/__init__.py index e69de29..6dd1b76 100644 --- 
a/src/factorlab/learning/unsupervised/__init__.py +++ b/src/factorlab/learning/unsupervised/__init__.py @@ -0,0 +1,17 @@ +from factorlab.learning.unsupervised.unsupervised_learning import ( + PCAWrapper, + PPCA, + R2PCA, + PCATransform, + PPCATransform, + R2PCATransform, +) + +__all__ = [ + "PCAWrapper", + "R2PCA", + "PPCA", + "PCATransform", + "PPCATransform", + "R2PCATransform", +] diff --git a/src/factorlab/learning/unsupervised/unsupervised_learning.py b/src/factorlab/learning/unsupervised/unsupervised_learning.py index 535fbf6..5cae934 100644 --- a/src/factorlab/learning/unsupervised/unsupervised_learning.py +++ b/src/factorlab/learning/unsupervised/unsupervised_learning.py @@ -3,10 +3,15 @@ from scipy.linalg import orth from scipy.optimize import linear_sum_assignment -from typing import Optional, Union, Any, Tuple +from typing import Optional, Union, Any, Tuple, Sequence, List from sklearn.decomposition import PCA from tqdm import tqdm +from factorlab.core.base_transform import BaseTransform +from factorlab.core.estimators.r2pca import R2PCA as CoreR2PCA +from factorlab.learning.utils import resolve_feature_columns +from factorlab.core.utils.utils import to_dataframe + class PCAWrapper: """ @@ -1134,3 +1139,329 @@ def get_expl_var_ratio(self) -> np.array: self.expl_var_ratio = self.eigenvals / np.sum(self.eigenvals) return self.expl_var_ratio[: self.n_components] + + +class PCATransform(BaseTransform): + """ + Pipeline-native PCA transform built on top of `PCAWrapper`. 
+ + Notes + ----- + For rolling/expanding point-in-time behavior, wrap this transform with: + - `factorlab.core.wrappers.RollingTransform` + - `factorlab.core.wrappers.ExpandingTransform` + """ + + def __init__( + self, + input_cols: Optional[Sequence[str]] = None, + n_components: Optional[int] = None, + missing_values: str = "drop_rows", + output_prefix: str = "PC", + append: bool = True, + exclude_cols: Optional[Sequence[str]] = None, + **pca_kwargs: Any, + ): + super().__init__( + name="PCATransform", + description="Fit PCA on training rows and append component scores.", + ) + self.input_cols = list(input_cols) if input_cols is not None else None + self.n_components = n_components + self.missing_values = missing_values + self.output_prefix = output_prefix + self.append = append + self.exclude_cols = list(exclude_cols) if exclude_cols is not None else [] + self.pca_kwargs = pca_kwargs + + self._resolved_input_cols: list[str] = [] + self._output_cols: list[str] = [] + self._sign: float = 1.0 + self._pca_wrapper: Optional[PCAWrapper] = None + self.eigenvectors_: Optional[pd.DataFrame] = None + self.explained_variance_ratio_: Optional[pd.Series] = None + + @property + def inputs(self) -> List[str]: + return list(self.input_cols or []) + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "PCATransform": + df = to_dataframe(X).copy(deep=True) + self._resolved_input_cols = resolve_feature_columns( + df=df, + feature_cols=self.input_cols, + exclude_cols=self.exclude_cols, + require_numeric=True, + ) + X_fit = df[self._resolved_input_cols] + self._pca_wrapper = PCAWrapper( + data=X_fit, + n_components=self.n_components, + missing_values=self.missing_values, + **self.pca_kwargs, + ) + clean = self._pca_wrapper.data + if clean.size == 0 or clean.shape[0] == 0: + raise ValueError("No valid rows available for PCA fit after missing-value handling.") + + self._pca_wrapper.pca.fit(clean) + train_pcs = 
self._pca_wrapper.pca.transform(clean) + self._sign = 1.0 + if train_pcs.shape[1] > 0 and np.dot(train_pcs[:, 0], np.mean(clean, axis=1)) < 0: + self._sign = -1.0 + + n_out = int(train_pcs.shape[1]) + self._output_cols = [f"{self.output_prefix}{i + 1}" for i in range(n_out)] + eig = self._pca_wrapper.pca.components_.T * self._sign + self.eigenvectors_ = pd.DataFrame(eig, index=self._resolved_input_cols, columns=self._output_cols) + self.explained_variance_ratio_ = pd.Series( + self._pca_wrapper.pca.explained_variance_ratio_, + index=self._output_cols, + name="explained_variance_ratio", + ) + + self._is_fitted = True + return self + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + if self._pca_wrapper is None: + raise RuntimeError("PCA internal state is not initialized.") + + df = to_dataframe(X).copy(deep=True) + missing = set(self._resolved_input_cols) - set(df.columns) + if missing: + raise ValueError(f"Missing input columns for PCATransform: {missing}") + + X_eval = df[self._resolved_input_cols] + valid = X_eval.notna().all(axis=1) + pcs_df = pd.DataFrame(np.nan, index=df.index, columns=self._output_cols, dtype=float) + if int(valid.sum()) > 0: + pcs = self._pca_wrapper.pca.transform(X_eval.loc[valid].to_numpy(dtype=float)) * self._sign + pcs_df.loc[valid, self._output_cols] = pcs + + if self.append: + out = df.copy(deep=True) + out.loc[:, self._output_cols] = pcs_df + return out + return pcs_df + + +class PPCATransform(BaseTransform): + """ + Pipeline-native probabilistic PCA transform built on top of `PPCA`. 
+ """ + + def __init__( + self, + input_cols: Optional[Sequence[str]] = None, + n_components: Optional[int] = None, + min_obs: int = 10, + min_feat: int = 5, + thresh: float = 1e-4, + output_prefix: str = "PC", + append: bool = True, + exclude_cols: Optional[Sequence[str]] = None, + ): + super().__init__( + name="PPCATransform", + description="Fit probabilistic PCA and append component scores.", + ) + self.input_cols = list(input_cols) if input_cols is not None else None + self.n_components = n_components + self.min_obs = int(min_obs) + self.min_feat = int(min_feat) + self.thresh = float(thresh) + self.output_prefix = output_prefix + self.append = append + self.exclude_cols = list(exclude_cols) if exclude_cols is not None else [] + + self._resolved_input_cols: list[str] = [] + self._selected_input_cols: list[str] = [] + self._output_cols: list[str] = [] + self._feature_means: Optional[pd.Series] = None + self._eigenvectors: Optional[np.ndarray] = None + self._ppca_model: Optional[PPCA] = None + self.eigenvectors_: Optional[pd.DataFrame] = None + self.explained_variance_ratio_: Optional[pd.Series] = None + + @property + def inputs(self) -> List[str]: + return list(self.input_cols or []) + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "PPCATransform": + df = to_dataframe(X).copy(deep=True) + self._resolved_input_cols = resolve_feature_columns( + df=df, + feature_cols=self.input_cols, + exclude_cols=self.exclude_cols, + require_numeric=True, + ) + X_fit = df[self._resolved_input_cols] + arr = X_fit.to_numpy(dtype=np.float64) + valid_cols = np.sum(~np.isnan(arr), axis=0) >= self.min_obs + self._selected_input_cols = [c for c, keep in zip(self._resolved_input_cols, valid_cols) if keep] + if len(self._selected_input_cols) == 0: + raise ValueError("No columns satisfy PPCA minimum-observation requirements.") + + self._ppca_model = PPCA( + data=X_fit[self._selected_input_cols], + min_obs=self.min_obs, + 
min_feat=self.min_feat, + n_components=self.n_components, + thresh=self.thresh, + ) + eig = self._ppca_model.get_eigenvectors() + if eig.ndim == 1: + eig = eig.reshape(-1, 1) + n_out = int(min(self._ppca_model.n_components, eig.shape[1])) + eig = eig[:, :n_out] + self._eigenvectors = eig + + means = np.asarray(self._ppca_model.feature_means, dtype=float) + if means.shape[0] != len(self._selected_input_cols): + means = np.nanmean(X_fit[self._selected_input_cols].to_numpy(dtype=float), axis=0) + self._feature_means = pd.Series(means, index=self._selected_input_cols, dtype=float) + + self._output_cols = [f"{self.output_prefix}{i + 1}" for i in range(n_out)] + self.eigenvectors_ = pd.DataFrame( + self._eigenvectors, + index=self._selected_input_cols, + columns=self._output_cols, + ) + evr = np.asarray(self._ppca_model.get_expl_var_ratio(), dtype=float).reshape(-1)[:n_out] + self.explained_variance_ratio_ = pd.Series( + evr, + index=self._output_cols, + name="explained_variance_ratio", + ) + + self._is_fitted = True + return self + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + if self._eigenvectors is None or self._feature_means is None: + raise RuntimeError("PPCA internal state is not initialized.") + + df = to_dataframe(X).copy(deep=True) + missing = set(self._selected_input_cols) - set(df.columns) + if missing: + raise ValueError(f"Missing input columns for PPCATransform: {missing}") + + X_eval = df[self._selected_input_cols] + centered = X_eval.sub(self._feature_means, axis=1).fillna(0.0) + pcs = centered.to_numpy(dtype=float) @ self._eigenvectors + pcs_df = pd.DataFrame(pcs, index=df.index, columns=self._output_cols) + + if self.append: + out = df.copy(deep=True) + out.loc[:, self._output_cols] = pcs_df + return out + return pcs_df + + +class R2PCATransform(BaseTransform): + """ + Pipeline-native transform wrapper over 
core `R2PCA` estimator. + """ + + def __init__( + self, + input_cols: Optional[Sequence[str]] = None, + n_components: Optional[int] = None, + output_prefix: str = "PC", + append: bool = True, + exclude_cols: Optional[Sequence[str]] = None, + **r2_kwargs: Any, + ): + super().__init__( + name="R2PCATransform", + description="Fit R2PCA and append component scores.", + ) + self.input_cols = list(input_cols) if input_cols is not None else None + self.n_components = n_components + self.output_prefix = output_prefix + self.append = append + self.exclude_cols = list(exclude_cols) if exclude_cols is not None else [] + self.r2_kwargs = r2_kwargs + + self._resolved_input_cols: list[str] = [] + self._output_cols: list[str] = [] + self._r2_model: Optional[CoreR2PCA] = None + self.eigenvectors_: Optional[pd.DataFrame] = None + self.explained_variance_ratio_: Optional[pd.Series] = None + + @property + def inputs(self) -> List[str]: + return list(self.input_cols or []) + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "R2PCATransform": + df = to_dataframe(X).copy(deep=True) + self._resolved_input_cols = resolve_feature_columns( + df=df, + feature_cols=self.input_cols, + exclude_cols=self.exclude_cols, + require_numeric=True, + ) + X_fit = df[self._resolved_input_cols] + + self._r2_model = CoreR2PCA(n_components=self.n_components, **self.r2_kwargs) + self._r2_model.fit(X_fit) + + n_out = int(self._r2_model.output.shape[1]) if self._r2_model.output is not None else int( + self.n_components or X_fit.shape[1] + ) + self._output_cols = [f"{self.output_prefix}{i + 1}" for i in range(n_out)] + + if self._r2_model.eigenvecs is not None: + eig = self._r2_model.eigenvecs.copy(deep=True) + eig.columns = self._output_cols[: eig.shape[1]] + self.eigenvectors_ = eig + if self._r2_model.expl_var_ratio is not None: + if isinstance(self._r2_model.expl_var_ratio, pd.DataFrame): + vals = 
self._r2_model.expl_var_ratio.iloc[-1].to_numpy(dtype=float) + else: + vals = np.asarray(self._r2_model.expl_var_ratio, dtype=float).reshape(-1) + self.explained_variance_ratio_ = pd.Series( + vals[: len(self._output_cols)], + index=self._output_cols, + name="explained_variance_ratio", + ) + + self._is_fitted = True + return self + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + if self._r2_model is None: + raise RuntimeError("R2PCA internal state is not initialized.") + + df = to_dataframe(X).copy(deep=True) + missing = set(self._resolved_input_cols) - set(df.columns) + if missing: + raise ValueError(f"Missing input columns for R2PCATransform: {missing}") + + pcs = self._r2_model.transform(df[self._resolved_input_cols]).copy(deep=True) + pcs.columns = self._output_cols[: pcs.shape[1]] + + if self.append: + out = df.copy(deep=True) + out.loc[:, pcs.columns] = pcs + return out + return pcs diff --git a/src/factorlab/learning/utils.py b/src/factorlab/learning/utils.py new file mode 100644 index 0000000..148cd96 --- /dev/null +++ b/src/factorlab/learning/utils.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +from typing import Optional, Sequence, Union + +import numpy as np +import pandas as pd + + +def extract_dates(index: pd.Index, date_level: int = 0) -> pd.DatetimeIndex: + """ + Extract a DatetimeIndex from an index or MultiIndex. + """ + if isinstance(index, pd.MultiIndex): + dates = index.get_level_values(date_level) + else: + dates = index + + if not pd.api.types.is_datetime64_any_dtype(dates): + dates = pd.to_datetime(dates) + + return pd.DatetimeIndex(dates) + + +def unique_sorted_dates(index: pd.Index, date_level: int = 0) -> pd.DatetimeIndex: + """ + Return unique sorted dates from an index or MultiIndex. 
+ """ + return pd.DatetimeIndex(pd.Index(extract_dates(index, date_level=date_level)).unique()).sort_values() + + +def validate_panel_xy( + X: pd.DataFrame, + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + date_level: int = 0, + require_multiindex: bool = True, +) -> None: + """ + Validate panel-style X/y inputs. + """ + if not isinstance(X, pd.DataFrame): + raise TypeError("X must be a pandas DataFrame.") + + if require_multiindex and not isinstance(X.index, pd.MultiIndex): + raise ValueError("X must use a MultiIndex with a datetime date level.") + + _ = extract_dates(X.index, date_level=date_level) + if not X.index.is_monotonic_increasing: + raise ValueError("X index must be sorted in increasing order before splitting.") + + if X.index.has_duplicates: + raise ValueError("X index contains duplicates. Remove duplicates before splitting.") + + if y is None: + return + + if not isinstance(y, (pd.Series, pd.DataFrame)): + raise TypeError("y must be a pandas Series or DataFrame.") + if not X.index.equals(y.index): + raise ValueError("X and y must share the exact same index.") + if not y.index.is_monotonic_increasing: + raise ValueError("y index must be sorted in increasing order before splitting.") + if y.index.has_duplicates: + raise ValueError("y index contains duplicates. Remove duplicates before splitting.") + _ = extract_dates(y.index, date_level=date_level) + + +def apply_purge( + train_dates: pd.DatetimeIndex, + unique_dates: pd.DatetimeIndex, + test_start: pd.Timestamp, + lookahead: int = 0, + embargo: int = 0, +) -> pd.DatetimeIndex: + """ + Purge training dates whose labels can overlap with the test window. 
+ + The cutoff rule is: + train_date_position < test_start_position - lookahead - embargo + """ + if lookahead < 0: + raise ValueError("lookahead must be >= 0.") + if embargo < 0: + raise ValueError("embargo must be >= 0.") + + if len(train_dates) == 0: + return train_dates + + test_start_pos = int(unique_dates.get_loc(test_start)) + cutoff = test_start_pos - int(lookahead) - int(embargo) + if cutoff <= 0: + return train_dates[:0] + + allowed = set(unique_dates[:cutoff]) + return pd.DatetimeIndex([d for d in train_dates if d in allowed]) + + +def mask_from_dates(dates: pd.DatetimeIndex, selected_dates: pd.DatetimeIndex) -> np.ndarray: + """ + Build a boolean mask selecting rows whose dates are in selected_dates. + """ + return dates.isin(selected_dates) + + +def resolve_feature_columns( + df: pd.DataFrame, + feature_cols: Optional[Sequence[str]] = None, + exclude_cols: Optional[Sequence[str]] = None, + require_numeric: bool = True, +) -> list[str]: + """ + Resolve feature columns with consistent validation rules. 
+ """ + if feature_cols is None: + if require_numeric: + cols = list(df.select_dtypes(include=[np.number]).columns) + else: + cols = list(df.columns) + else: + cols = list(feature_cols) + missing = set(cols) - set(df.columns) + if missing: + raise ValueError(f"Missing feature columns: {missing}") + + exclude = {c for c in (exclude_cols or []) if c is not None} + cols = [c for c in cols if c not in exclude] + if len(cols) == 0: + raise ValueError("No valid feature columns resolved after exclusions.") + return cols diff --git a/src/factorlab/portfolio/construction/factor_mimicking.py b/src/factorlab/portfolio/construction/factor_mimicking.py index 0b81bed..0d52a0c 100644 --- a/src/factorlab/portfolio/construction/factor_mimicking.py +++ b/src/factorlab/portfolio/construction/factor_mimicking.py @@ -1,11 +1,11 @@ import pandas as pd import numpy as np from typing import Optional, Union +from sklearn.linear_model import LinearRegression -from factorlab.feature_engineering.transformations import Transform -from factorlab.forecasting.time_series_analysis import rolling_window, expanding_window - -from factorlab.forecasting.supervised_learning import SPCA +from factorlab.learning.selectors import FeatureSelector +from factorlab.learning.time_series_analysis import add_lags +from factorlab.learning.unsupervised.unsupervised_learning import PCAWrapper class FMP: @@ -42,6 +42,110 @@ def __init__(self, self.factor_cols = None self.return_cols = None + @staticmethod + def _orthogonalize_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """ + Orthogonalize factor columns via sequential residualization. 
+ """ + out = pd.DataFrame(index=df.index, columns=df.columns, dtype=float) + cols = list(df.columns) + for i, col in enumerate(cols): + y = df[col].astype(float) + if i == 0: + out[col] = y + continue + + X_prev = out.iloc[:, :i] + valid = X_prev.notna().all(axis=1) & y.notna() + if int(valid.sum()) <= i: + out[col] = y + continue + + beta = np.linalg.lstsq(X_prev.loc[valid].to_numpy(), y.loc[valid].to_numpy(), rcond=None)[0] + pred = pd.Series(np.nan, index=df.index, dtype=float) + pred.loc[valid] = X_prev.loc[valid].to_numpy() @ beta + out[col] = y - pred + return out + + @staticmethod + def _target_vol(df: pd.DataFrame, ann_vol: float, ann_factor: int) -> pd.DataFrame: + """ + Scale each series to the target annualized volatility. + """ + if ann_factor is None or ann_factor <= 0: + return df + vol = df.std(ddof=0) * np.sqrt(float(ann_factor)) + scale = pd.Series(1.0, index=df.columns, dtype=float) + valid = vol > 0 + scale.loc[valid] = float(ann_vol) / vol.loc[valid] + return df.mul(scale, axis=1) + + @staticmethod + def _expanding_linear_prediction( + target: pd.Series, + features: pd.DataFrame, + min_obs: Optional[int] = None, + ) -> pd.DataFrame: + """ + Generate expanding-window one-step-ahead predictions. 
+ """ + df = pd.concat([target.rename("target"), features], axis=1).dropna() + if df.empty: + return pd.DataFrame(columns=["pred"], index=target.index, dtype=float) + + if min_obs is None: + min_obs = max(2, features.shape[1] + 1) + min_obs = max(2, int(min_obs)) + if min_obs >= len(df): + return pd.DataFrame(columns=["pred"], index=df.index, dtype=float) + + pred = pd.Series(np.nan, index=df.index, dtype=float) + for row in range(min_obs, len(df)): + train = df.iloc[:row] + X_train = train.iloc[:, 1:] + y_train = train.iloc[:, 0] + X_next = df.iloc[[row], 1:] + model = LinearRegression() + model.fit(X_train, y_train) + pred.iloc[row] = float(model.predict(X_next)[0]) + + return pred.to_frame(name="pred") + + @staticmethod + def _supervised_expanding_pcs( + target: pd.Series, + features: pd.DataFrame, + n_feat: int = 30, + method: str = "lasso", + ) -> pd.DataFrame: + """ + Select top predictive features and compute expanding PCs. + """ + target_name = target.name if target.name is not None else "target" + df = pd.concat([features, target.rename(target_name)], axis=1).dropna() + if df.empty: + return pd.DataFrame(index=features.index) + + selector = FeatureSelector( + method=method, + feature_cols=list(features.columns), + target_col=target_name, + n_features=min(int(n_feat), int(features.shape[1])), + drop_unselected=True, + ) + selector.fit(df) + selected = selector.transform(df).drop(columns=[target_name], errors="ignore") + if selected.empty: + return pd.DataFrame(index=df.index) + + n_components = max(1, min(selected.shape[0], selected.shape[1])) + min_obs = max(2, n_components) + pcs = PCAWrapper(selected, n_components=n_components).get_expanding_pcs(min_obs=min_obs) + if isinstance(pcs, np.ndarray): + pcs = pd.DataFrame(pcs, index=selected.index[-pcs.shape[0]:]) + pcs.columns = [f"PC{i + 1}" for i in range(pcs.shape[1])] + return pcs + def orthogonalize_factors(self, window_type: str = 'fixed', min_obs: int = 12, window_size: int = 36) -> None: """ 
Orthogonalize factors. @@ -55,12 +159,32 @@ def orthogonalize_factors(self, window_type: str = 'fixed', min_obs: int = 12, w window_size: int, default=36 Window size for rolling window. """ + factors_df = self.factors.copy(deep=True) if isinstance(self.factors, pd.DataFrame) else pd.DataFrame(self.factors) + if window_type == 'rolling': - self.factors = rolling_window(Transform, self.factors, method='orthogonalize', window_size=window_size) + if window_size < 1 or window_size > len(factors_df): + raise ValueError("window_size must be within [1, len(factors)].") + rows = [] + idx = [] + for row in range(window_size, len(factors_df) + 1): + window_df = factors_df.iloc[row - window_size: row] + orth_window = self._orthogonalize_dataframe(window_df) + rows.append(orth_window.iloc[-1]) + idx.append(window_df.index[-1]) + self.factors = pd.DataFrame(rows, index=idx, columns=factors_df.columns) elif window_type == 'expanding': - self.factors = expanding_window(Transform, self.factors, method='orthogonalize', min_obs=min_obs) + if min_obs < 1 or min_obs > len(factors_df): + raise ValueError("min_obs must be within [1, len(factors)].") + rows = [] + idx = [] + for row in range(min_obs, len(factors_df) + 1): + window_df = factors_df.iloc[:row] + orth_window = self._orthogonalize_dataframe(window_df) + rows.append(orth_window.iloc[-1]) + idx.append(window_df.index[-1]) + self.factors = pd.DataFrame(rows, index=idx, columns=factors_df.columns) else: - self.factors = Transform(self.factors).orthogonalize() + self.factors = self._orthogonalize_dataframe(factors_df) def adj_factor_vol(self, ann_vol: int = 0.15) -> pd.DataFrame: """ @@ -79,21 +203,29 @@ def adj_factor_vol(self, ann_vol: int = 0.15) -> pd.DataFrame: # ann factor self.get_ann_factor() # adj to target vol` - self.factors = Transform(self.factors).target_vol(ann_vol=ann_vol, ann_factor=self.ann_factor) + factors_df = self.factors.copy(deep=True) if isinstance(self.factors, pd.DataFrame) else pd.DataFrame(self.factors) 
+ self.factors = self._target_vol(factors_df, ann_vol=ann_vol, ann_factor=int(self.ann_factor)) def preprocess_data(self) -> None: """ Preprocess data. """ - if isinstance(self.returns, pd.Series) or isinstance(self.returns, pd.DataFrame) and \ - isinstance(self.factors, pd.DataFrame): - self.data = pd.concat([self.returns, self.factors], axis=1).dropna() + if isinstance(self.returns, (pd.Series, pd.DataFrame)) and isinstance(self.factors, pd.DataFrame): + returns_df = self.returns.to_frame() if isinstance(self.returns, pd.Series) else self.returns.copy(deep=True) + factors_df = self.factors.copy(deep=True) + self.return_cols = list(returns_df.columns) + self.factor_cols = list(factors_df.columns) + + self.data = pd.concat([returns_df, factors_df], axis=1).dropna() self.index = self.data.index - self.factors = self.data.iloc[:, 1:] - self.returns = self.data.iloc[:, 0] + self.returns = self.data.loc[:, self.return_cols] + self.factors = self.data.loc[:, self.factor_cols] elif isinstance(self.returns, np.ndarray) and isinstance(self.factors, np.ndarray): n = min(self.returns.shape[0], self.factors.shape[0]) - self.data = np.concatenate([self.returns[:n].reshape(-1, 1), self.factors[:n].reshape(-1, 1)], axis=1) + factors_arr = self.factors[:n] + if factors_arr.ndim == 1: + factors_arr = factors_arr.reshape(-1, 1) + self.data = np.concatenate([self.returns[:n].reshape(-1, 1), factors_arr], axis=1) self.factors = self.data[:, 1:] self.returns = self.data[:, 0] else: @@ -108,9 +240,9 @@ def get_ann_factor(self) -> str: ann_factor: int Annualization factor. 
""" - if self.ann_factor is None and isinstance(self.returns, pd.DataFrame) or isinstance(self.returns, pd.Series): + if self.ann_factor is None and isinstance(self.returns, (pd.DataFrame, pd.Series)): # infer freq - if isinstance(self.returns, pd.MultiIndex): + if isinstance(self.returns.index, pd.MultiIndex): freq = pd.infer_freq(self.returns.index.levels[0]) else: freq = pd.infer_freq(self.returns.index) @@ -149,8 +281,8 @@ def add_lags(self, n_lags: int = 24) -> pd.DataFrame: n_lags: int, default=24 Number of lags to add to pct changes. """ - # convert to pct chg - self.convert_to_pct_chg() + if self.pct_chg is None: + self.convert_to_pct_chg() self.pct_chg = add_lags(self.pct_chg, n_lags=n_lags) def predict_factors(self, @@ -176,6 +308,9 @@ def predict_factors(self, fwd: int, default=0 Number of periods to shift returns forward. """ + # align and clean data + self.preprocess_data() + # orthogonalize factors self.orthogonalize_factors(window_type=window_type) # adj factors to target vol @@ -187,12 +322,22 @@ def predict_factors(self, # iterate over factors for factor in self.factors.columns: - # targeted pca - pcs = SPCA(self.factors[factor].shift(fwd*-1), self.pct_chg, n_feat=30).get_expanding_pcs() - # predict factor - pred = linear_reg(self.factors[factor].shift(fwd*-1), pcs, window_type='expanding') - # add to df - self.factors_pred = pd.concat([self.factors_pred, pred.rename(columns={'pred': factor})], axis=1) + target = self.factors[factor].shift(-int(fwd)) if int(fwd) > 0 else self.factors[factor] + pcs = self._supervised_expanding_pcs( + target=target, + features=self.pct_chg, + n_feat=30, + method="lasso", + ) + if pcs.empty: + continue + + pred = self._expanding_linear_prediction( + target=target.reindex(pcs.index), + features=pcs, + min_obs=max(2, pcs.shape[1] + 1), + ) + self.factors_pred = pd.concat([self.factors_pred, pred.rename(columns={"pred": factor})], axis=1) def factor_exposures(self) -> pd.DataFrame: """ @@ -205,8 +350,15 @@ def 
factor_exposures(self) -> pd.DataFrame: betas_df = pd.DataFrame(index=idx, columns=factor_cols) # iterate over ret columns for col in self.returns.columns: - betas = linear_reg(self.returns[col], self.factors_pred, output='coef') - betas_df.loc[(date, col), :] = betas.values.T + df = pd.concat([self.returns[col].rename("ret"), self.factors_pred], axis=1).dropna() + if df.empty: + continue + X = df.iloc[:, 1:] + y = df.iloc[:, 0] + model = LinearRegression() + model.fit(X, y) + betas_df.loc[(date, col), self.factors_pred.columns] = model.coef_ + betas_df.loc[(date, col), "const"] = model.intercept_ # set betas self.betas = betas_df diff --git a/src/factorlab/strategy/strategy.py b/src/factorlab/strategy/strategy.py index 61dfa2d..0304aa5 100644 --- a/src/factorlab/strategy/strategy.py +++ b/src/factorlab/strategy/strategy.py @@ -1,10 +1,28 @@ from factorlab.core.pipeline import Pipeline +from factorlab.learning.splitters import BasePanelSplit from factorlab.portfolio.optimization.base import PortfolioOptimizerBase from factorlab.portfolio.cost_models.base import CostModelBase -from typing import Union +from factorlab.targets.forward import ForwardTargetSpec +from typing import Optional, Union +from dataclasses import dataclass +import pandas as pd -class Strategy: +@dataclass +class LearningSpec: + """ + Optional walk-forward learning configuration for strategy-level orchestration. + """ + + splitter: BasePanelSplit + y: Optional[Union[str, pd.Series, pd.DataFrame]] = None + target_spec: Optional[ForwardTargetSpec] = None + strict_temporal: bool = True + date_level: int = 0 + show_progress: bool = False + + +class StrategySpec: """ Strategy Specification class that encapsulates the definition of a trading strategy. 
It defines the combination of research components (Signal, Optimizer) @@ -19,6 +37,7 @@ def __init__(self, data_pipeline: Pipeline, optimizer: PortfolioOptimizerBase, cost_model: CostModelBase, + learning_spec: Optional[LearningSpec] = None, rebal_freq: Union[str, int] = 'd'): """ Parameters @@ -41,9 +60,14 @@ def __init__(self, self.data_pipeline = data_pipeline self.optimizer = optimizer self.cost_model = cost_model + self.learning_spec = learning_spec self.rebal_freq = rebal_freq # add other global constraints here, like max_leverage, # max_turnover, etc., as the strategy becomes more complex. print(f"Strategy Config '{self.name}' created.") + + +# Backward-compat alias — remove after notebooks updated to StrategySpec +Strategy = StrategySpec diff --git a/src/factorlab/targets/__init__.py b/src/factorlab/targets/__init__.py index e69de29..60505a0 100644 --- a/src/factorlab/targets/__init__.py +++ b/src/factorlab/targets/__init__.py @@ -0,0 +1,9 @@ +from factorlab.targets.forward import ForwardDirectionTarget, ForwardReturnTarget, ForwardTargetSpec +from factorlab.targets.expected_returns import ReturnEstimators + +__all__ = [ + "ForwardReturnTarget", + "ForwardDirectionTarget", + "ForwardTargetSpec", + "ReturnEstimators", +] diff --git a/src/factorlab/targets/forward.py b/src/factorlab/targets/forward.py new file mode 100644 index 0000000..6822018 --- /dev/null +++ b/src/factorlab/targets/forward.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Callable, List, Optional, Union + +import numpy as np +import pandas as pd + +from factorlab.learning.utils import extract_dates +from factorlab.targets.base import Target +from factorlab.core.utils.utils import to_dataframe + + +class ForwardReturnTarget(Target): + """ + Build a forward return target column from a price column. + + This target is designed for supervised forecasting in walk-forward workflows. 
+ """ + + def __init__( + self, + input_col: str = "close", + output_col: str = "target", + horizon: int = 1, + method: str = "pct", + group_level: int = 1, + ): + super().__init__( + name="ForwardReturnTarget", + description="Forward return target from a price column.", + ) + if horizon < 1: + raise ValueError("horizon must be >= 1.") + if method not in {"pct", "log", "diff"}: + raise ValueError("method must be one of {'pct', 'log', 'diff'}.") + + self.input_col = input_col + self.output_col = output_col + self.horizon = int(horizon) + self.method = method + self.group_level = group_level + + @property + def inputs(self) -> List[str]: + return [self.input_col] + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "ForwardReturnTarget": + df = to_dataframe(X) + self.validate_inputs(df) + self._is_fitted = True + return self + + def _forward_price(self, df: pd.DataFrame) -> pd.Series: + price = df[self.input_col] + if isinstance(df.index, pd.MultiIndex): + return price.groupby(level=self.group_level, observed=True).shift(-self.horizon) + return price.shift(-self.horizon) + + def _compute_target(self, df: pd.DataFrame) -> pd.Series: + price = df[self.input_col] + fwd_price = self._forward_price(df) + + if self.method == "pct": + target = fwd_price.div(price) - 1.0 + elif self.method == "log": + ratio = fwd_price.div(price) + target = np.log(ratio.where(ratio > 0)) + else: + target = fwd_price - price + + return target.replace([np.inf, -np.inf], np.nan) + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + if not self._is_fitted: + raise RuntimeError(f"Transform '{self.name}' must be fitted before calling transform().") + + df = to_dataframe(X).copy(deep=True) + self.validate_inputs(df) + df[self.output_col] = self._compute_target(df) + return df + + +class ForwardDirectionTarget(ForwardReturnTarget): + """ + Build a binary forward-direction target from a forward return. 
+ """ + + def __init__( + self, + input_col: str = "close", + output_col: str = "target", + horizon: int = 1, + threshold: float = 0.0, + group_level: int = 1, + ): + super().__init__( + input_col=input_col, + output_col=output_col, + horizon=horizon, + method="pct", + group_level=group_level, + ) + self.name = "ForwardDirectionTarget" + self.description = "Binary target from forward returns." + self.threshold = float(threshold) + + def _compute_target(self, df: pd.DataFrame) -> pd.Series: + fwd_ret = super()._compute_target(df) + out = pd.Series(np.nan, index=fwd_ret.index, dtype=float) + valid = fwd_ret.notna() + out.loc[valid] = (fwd_ret.loc[valid] > self.threshold).astype(float) + return out + + +@dataclass(frozen=True) +class ForwardTargetSpec: + """ + Declarative target specification for fold-local walk-forward training. + + The spec can build a target series on demand and compute the rows that are + safe to use for training at a given fold boundary. + """ + + input_col: str = "close" + output_col: str = "target" + horizon: int = 1 + kind: str = "return" + method: str = "pct" + threshold: float = 0.0 + group_level: int = 1 + clip_lower: Optional[float] = None + clip_upper: Optional[float] = None + label_func: Optional[Callable[[pd.Series], pd.Series]] = None + + def __post_init__(self) -> None: + if self.horizon < 1: + raise ValueError("ForwardTargetSpec.horizon must be >= 1.") + if self.kind not in {"return", "direction"}: + raise ValueError("ForwardTargetSpec.kind must be one of {'return', 'direction'}.") + if self.kind == "return" and self.method not in {"pct", "log", "diff"}: + raise ValueError("ForwardTargetSpec.method must be one of {'pct', 'log', 'diff'}.") + if self.clip_lower is not None and self.clip_upper is not None and self.clip_lower > self.clip_upper: + raise ValueError("clip_lower must be <= clip_upper when both are set.") + + def _build_transform(self) -> Target: + if self.kind == "direction": + return ForwardDirectionTarget( + 
input_col=self.input_col, + output_col=self.output_col, + horizon=self.horizon, + threshold=self.threshold, + group_level=self.group_level, + ) + return ForwardReturnTarget( + input_col=self.input_col, + output_col=self.output_col, + horizon=self.horizon, + method=self.method, + group_level=self.group_level, + ) + + def build(self, X: Union[pd.Series, pd.DataFrame]) -> pd.Series: + """ + Build the target series for the provided data. + """ + df = to_dataframe(X).copy(deep=True) + transform = self._build_transform() + transform.fit(df) + out = transform.transform(df)[self.output_col] + + if self.clip_lower is not None or self.clip_upper is not None: + out = out.clip(lower=self.clip_lower, upper=self.clip_upper) + if self.label_func is not None: + out = self.label_func(out) + + return out.rename(self.output_col) + + def label_end_dates(self, index: pd.Index, date_level: int = 0) -> pd.Series: + """ + Return the timestamp used by each row's forward label. + """ + if isinstance(index, pd.MultiIndex): + row_dates = pd.Series(extract_dates(index, date_level=date_level), index=index) + return row_dates.groupby(level=self.group_level, observed=True).shift(-self.horizon) + + row_dates = pd.Series(extract_dates(index, date_level=date_level), index=index) + return row_dates.shift(-self.horizon) + + def trainable_mask( + self, + index: pd.Index, + train_index: pd.Index, + date_level: int = 0, + ) -> np.ndarray: + """ + Return a boolean mask for rows safe to train on at this fold. + + A row is trainable if: + - It belongs to the fold's train_index. + - Its forward label end-date is defined. + - Its forward label end-date is not after the fold train_end date. 
+ """ + if len(train_index) == 0: + return np.zeros(len(index), dtype=bool) + + train_dates = extract_dates(train_index, date_level=date_level) + train_end = pd.Timestamp(train_dates.max()) + label_end = self.label_end_dates(index=index, date_level=date_level) + + return ( + pd.Index(index).isin(train_index) + & label_end.notna().to_numpy() + & (pd.DatetimeIndex(label_end.values) <= train_end) + ) diff --git a/tests/learning/test_crypto_pipeline_strategy_integration.py b/tests/learning/test_crypto_pipeline_strategy_integration.py new file mode 100644 index 0000000..d746139 --- /dev/null +++ b/tests/learning/test_crypto_pipeline_strategy_integration.py @@ -0,0 +1,450 @@ +from pathlib import Path + +import pandas as pd +import pytest +from sklearn.linear_model import Ridge +from sklearn.linear_model import LinearRegression + +from factorlab.analytics.performance import Performance +from factorlab.backtesting.engine import BacktestEngine +from factorlab.core.pipeline import Pipeline +from factorlab.factors.library.liquidity.liquidity import Liquidity +from factorlab.factors.library.trend.trend import Trend +from factorlab.factors.library.volatility.volatility import Vol +from factorlab.features.transforms.returns import Returns +from factorlab.learning import ( + ExpandingIncrementPanelSplit, + FeatureSelector, + RegressionLearner, + WalkForwardLearner, +) +from factorlab.learning.sklearn_wrapper import SKLearnWrapper +from factorlab.portfolio.cost_models.fixed import FixedCommissionModel +from factorlab.portfolio.optimization.signal_weighted import SignalWeighted +from factorlab.signals.generator import SignalGenerator +from factorlab.strategy.strategy import LearningSpec, StrategySpec +from factorlab.targets import ForwardReturnTarget, ForwardTargetSpec + + +def _find_real_crypto_ohlc_path() -> Path | None: + repo_root = Path(__file__).resolve().parents[2] + candidates = [ + repo_root.parent / "data" / "systamental" / "crypto" / "markets" / "perpetual_futures" / 
"daily" / "ohlcv" / "clean" / "binance.parquet", + repo_root.parent / "data" / "systamental" / "crypto" / "markets" / "spot" / "daily" / "ohlcv" / "clean" / "binance.parquet", + repo_root / "data" / "systamental" / "crypto" / "markets" / "perpetual_futures" / "daily" / "ohlcv" / "clean" / "binance.parquet", + repo_root / "data" / "systamental" / "crypto" / "markets" / "spot" / "daily" / "ohlcv" / "clean" / "binance.parquet", + ] + for path in candidates: + if path.exists(): + return path + return None + + +def _load_real_crypto_panel( + n_days: int = 320, + preferred_tickers: tuple[str, ...] = ("BTC", "ETH", "XRP", "ADA", "LINK"), +) -> pd.DataFrame | None: + path = _find_real_crypto_ohlc_path() + if path is None: + return None + + df = pd.read_parquet(path) + if not isinstance(df.index, pd.MultiIndex): + return None + if "close" not in df.columns or "volume" not in df.columns: + return None + + if "funding_rate" not in df.columns: + df = df.copy() + df["funding_rate"] = 0.0 + + df = df[["close", "volume", "funding_rate"]].sort_index() + if df.index.names != ["date", "ticker"]: + df.index = df.index.set_names(["date", "ticker"]) + + available_tickers = set(df.index.get_level_values("ticker").unique()) + selected = [ticker for ticker in preferred_tickers if ticker in available_tickers] + if len(selected) < 3: + counts = df.groupby(level="ticker").size().sort_values(ascending=False) + selected = list(counts.head(5).index) + if len(selected) < 3: + return None + + df = df.loc[(slice(None), selected), :].sort_index() + end_date = df.index.get_level_values("date").max() + start_date = end_date - pd.Timedelta(days=n_days - 1) + date_idx = df.index.get_level_values("date") + df = df[(date_idx >= start_date) & (date_idx <= end_date)].sort_index() + + df = df[~df.index.duplicated(keep="last")] + df = df.dropna(subset=["close", "volume"]) + if df.empty or df.index.get_level_values("ticker").nunique() < 3: + return None + return df + + +@pytest.fixture(scope="module") +def 
crypto_panel() -> pd.DataFrame: + real_data = _load_real_crypto_panel() + if real_data is None: + pytest.skip("Real crypto OHLCV parquet data not available under ../data or ./data.") + return real_data + + +def test_crypto_factor_to_learning_to_strategy_evaluation(crypto_panel: pd.DataFrame): + data = crypto_panel.copy() + + pipeline = Pipeline( + steps=[ + ( + "returns", + Returns(method="pct", input_col="close", output_col="ret", lags=1), + ), + ( + "trend", + Trend( + method="price_momentum", + input_col="close", + window_size=5, + scale=False, + ), + ), + ( + "volatility", + Vol( + method="std", + input_col="ret", + window_type="rolling", + window_size=20, + annualize=False, + ), + ), + ( + "liquidity", + Liquidity( + method="amihud", + return_col="ret", + price_col="close", + volume_col="volume", + ), + ), + ( + "learner", + WalkForwardLearner( + model=Ridge(alpha=1.0), + feature_cols=["PriceMomentum_5", "STD_rolling_20", "AmihudIlliquidity"], + target_transform=ForwardReturnTarget( + input_col="close", + output_col="fwd_ret", + horizon=1, + ), + prediction_col="forecast", + window_type="expanding", + min_train_periods=45, + retrain_interval=1, + min_train_samples=150, + ), + ), + ( + "signal", + SignalGenerator( + signal_type="sign", + input_col="forecast", + output_col="signal", + ), + ), + ] + ) + + strategy = StrategySpec( + name="CryptoLearningIntegration", + data_pipeline=pipeline, + optimizer=SignalWeighted(window_size=20), + cost_model=FixedCommissionModel(rate=0.0005), + rebal_freq="d", + ) + + engine = BacktestEngine( + config=strategy, + data=data, + initial_capital=1_000_000.0, + verbose=False, + ann_factor=365, + ) + + results = engine.run_backtest() + assert results + assert "account_value" in results + assert "weights" in results + assert len(results["account_value"]) > 10 + assert results["account_value"].iloc[-1] > 0 + + strategy_returns = engine.portfolio_return.dropna().to_frame("strategy") + perf_table = Performance(strategy_returns, 
ret_type="simple", ann_factor=365).get_table(metrics="key_metrics") + assert not perf_table.empty + assert "Sharpe ratio" in perf_table.columns + + +def test_crypto_strategy_with_walk_forward_runner_learning_spec(crypto_panel: pd.DataFrame): + data = crypto_panel.copy() + + pipeline = Pipeline( + steps=[ + ( + "returns", + Returns(method="pct", input_col="close", output_col="ret", lags=1), + ), + ( + "target", + ForwardReturnTarget(input_col="close", output_col="target", horizon=1), + ), + ( + "learner", + SKLearnWrapper( + model=LinearRegression(), + feature_cols=["ret"], + target_col="target", + output_col="forecast", + ), + ), + ( + "signal", + SignalGenerator( + signal_type="sign", + input_col="forecast", + output_col="signal", + ), + ), + ] + ) + + learning_spec = LearningSpec( + splitter=ExpandingIncrementPanelSplit( + train_intervals=1, + test_size=1, + min_train_periods=45, + lookahead=1, + embargo=0, + ), + date_level=0, + show_progress=False, + ) + + strategy = StrategySpec( + name="CryptoLearningSpecIntegration", + data_pipeline=pipeline, + optimizer=SignalWeighted(window_size=20), + cost_model=FixedCommissionModel(rate=0.0005), + learning_spec=learning_spec, + rebal_freq="d", + ) + + engine = BacktestEngine( + config=strategy, + data=data, + initial_capital=1_000_000.0, + verbose=False, + ann_factor=365, + ) + + results = engine.run_backtest() + assert results + assert "account_value" in results + assert engine.walk_forward_runner is not None + assert len(engine.walk_forward_runner.fold_info) > 0 + + +def test_crypto_strategy_learning_spec_with_target_spec(crypto_panel: pd.DataFrame): + data = crypto_panel.copy() + + pipeline = Pipeline( + steps=[ + ( + "returns", + Returns(method="pct", input_col="close", output_col="ret", lags=1), + ), + ( + "learner", + SKLearnWrapper( + model=LinearRegression(), + feature_cols=["ret"], + target_col=None, + output_col="forecast", + ), + ), + ( + "signal", + SignalGenerator( + signal_type="sign", + 
input_col="forecast", + output_col="signal", + ), + ), + ] + ) + + learning_spec = LearningSpec( + splitter=ExpandingIncrementPanelSplit( + train_intervals=1, + test_size=1, + min_train_periods=45, + lookahead=1, + embargo=0, + ), + target_spec=ForwardTargetSpec( + input_col="close", + output_col="target", + horizon=1, + kind="return", + method="pct", + group_level=1, + ), + date_level=0, + show_progress=False, + ) + + strategy = StrategySpec( + name="CryptoLearningSpecTargetSpecIntegration", + data_pipeline=pipeline, + optimizer=SignalWeighted(window_size=20), + cost_model=FixedCommissionModel(rate=0.0005), + learning_spec=learning_spec, + rebal_freq="d", + ) + + engine = BacktestEngine( + config=strategy, + data=data, + initial_capital=1_000_000.0, + verbose=False, + ann_factor=365, + ) + + results = engine.run_backtest() + assert results + assert "account_value" in results + assert engine.walk_forward_runner is not None + assert len(engine.walk_forward_runner.fold_info) > 0 + assert any( + (info.n_trainable is not None) and (info.n_trainable > 0) + for info in engine.walk_forward_runner.fold_info + ) + + +def test_crypto_strategy_pipeline_native_selector_and_learner(crypto_panel: pd.DataFrame): + data = crypto_panel.copy() + + pipeline = Pipeline( + steps=[ + ( + "returns", + Returns(method="pct", input_col="close", output_col="ret", lags=1), + ), + ( + "trend", + Trend( + method="price_momentum", + input_col="close", + window_size=5, + scale=False, + ), + ), + ( + "volatility", + Vol( + method="std", + input_col="ret", + window_type="rolling", + window_size=20, + annualize=False, + ), + ), + ( + "liquidity", + Liquidity( + method="amihud", + return_col="ret", + price_col="close", + volume_col="volume", + ), + ), + ( + "selector", + FeatureSelector( + method="spearman", + feature_cols=["PriceMomentum_5", "STD_rolling_20", "AmihudIlliquidity"], + n_features=2, + drop_unselected=True, + ), + ), + ( + "learner", + RegressionLearner( + method="ridge", + 
feature_cols=None, + target_col=None, + output_col="forecast", + exclude_cols=["close", "volume", "funding_rate", "ret"], + alpha=1.0, + ), + ), + ( + "signal", + SignalGenerator( + signal_type="sign", + input_col="forecast", + output_col="signal", + ), + ), + ] + ) + + learning_spec = LearningSpec( + splitter=ExpandingIncrementPanelSplit( + train_intervals=1, + test_size=1, + min_train_periods=45, + lookahead=1, + embargo=0, + ), + target_spec=ForwardTargetSpec( + input_col="close", + output_col="target", + horizon=1, + kind="return", + method="pct", + group_level=1, + ), + date_level=0, + show_progress=False, + ) + + strategy = StrategySpec( + name="CryptoPipelineNativeSelectorLearner", + data_pipeline=pipeline, + optimizer=SignalWeighted(window_size=20), + cost_model=FixedCommissionModel(rate=0.0005), + learning_spec=learning_spec, + rebal_freq="d", + ) + + engine = BacktestEngine( + config=strategy, + data=data, + initial_capital=1_000_000.0, + verbose=False, + ann_factor=365, + ) + + results = engine.run_backtest() + assert results + assert "account_value" in results + assert "weights" in results + assert engine.walk_forward_runner is not None + assert len(engine.walk_forward_runner.fold_info) > 0 + assert results["account_value"].iloc[-1] > 0 + + strategy_returns = engine.portfolio_return.dropna().to_frame("strategy") + perf_table = Performance(strategy_returns, ret_type="simple", ann_factor=365).get_table(metrics="key_metrics") + assert not perf_table.empty + assert "Sharpe ratio" in perf_table.columns diff --git a/tests/learning/test_factor_mimicking.py b/tests/learning/test_factor_mimicking.py new file mode 100644 index 0000000..4e8354a --- /dev/null +++ b/tests/learning/test_factor_mimicking.py @@ -0,0 +1,61 @@ +import numpy as np +import pandas as pd +import pytest + +from factorlab.portfolio.construction.factor_mimicking import FMP + + +def _make_inputs(n: int = 120, seed: int = 17): + rng = np.random.default_rng(seed) + idx = 
pd.date_range("2018-01-31", periods=n, freq="ME") + + factors = pd.DataFrame( + { + "factor_a": rng.normal(scale=0.04, size=n), + "factor_b": rng.normal(scale=0.03, size=n), + "factor_c": rng.normal(scale=0.05, size=n), + }, + index=idx, + ) + + returns = pd.DataFrame( + { + "asset_1": 0.3 * factors["factor_a"] + rng.normal(scale=0.02, size=n), + "asset_2": 0.4 * factors["factor_b"] + rng.normal(scale=0.02, size=n), + "asset_3": 0.2 * factors["factor_c"] + rng.normal(scale=0.02, size=n), + }, + index=idx, + ) + + return returns, factors + + +def test_factor_mimicking_predict_and_exposure_runs(): + returns, factors = _make_inputs() + fmp = FMP(returns=returns, factors=factors, ann_factor=12) + + fmp.predict_factors( + window_type="fixed", + ann_vol=0.15, + periods=12, + n_lags=6, + fwd=0, + ) + assert not fmp.factors_pred.empty + assert set(fmp.factors_pred.columns).issubset(set(factors.columns)) + + fmp.factor_exposures() + assert fmp.betas is not None + assert "const" in fmp.betas.columns + assert set(factors.columns).issubset(set(fmp.betas.columns)) + + +def test_expanding_linear_prediction_is_strictly_one_step_ahead(): + idx = pd.date_range("2020-01-01", periods=10, freq="D") + features = pd.DataFrame({"x": np.arange(len(idx), dtype=float)}, index=idx) + target = pd.Series(np.arange(len(idx), dtype=float), index=idx, name="target") + + pred = FMP._expanding_linear_prediction(target=target, features=features, min_obs=3)["pred"] + + assert pd.isna(pred.iloc[2]) + assert pred.iloc[3] == pytest.approx(3.0) diff --git a/tests/learning/test_feature_selector.py b/tests/learning/test_feature_selector.py new file mode 100644 index 0000000..329a959 --- /dev/null +++ b/tests/learning/test_feature_selector.py @@ -0,0 +1,189 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import LinearRegression + +from factorlab.core.pipeline import Pipeline +from factorlab.learning import FeatureSelector +from factorlab.learning.sklearn_wrapper import 
SKLearnWrapper + + +def _make_df(n: int = 120, seed: int = 7) -> pd.DataFrame: + rng = np.random.default_rng(seed) + idx = pd.date_range("2021-01-01", periods=n, freq="D") + f1 = rng.normal(size=n) + f2 = rng.normal(size=n) + noise = rng.normal(scale=0.05, size=n) + target = 0.8 * f1 + 0.1 * noise + return pd.DataFrame({"f1": f1, "f2": f2, "target": target}, index=idx) + + +def test_feature_selector_spearman_selects_informative_feature(): + df = _make_df() + selector = FeatureSelector( + method="spearman", + feature_cols=["f1", "f2"], + n_features=1, + drop_unselected=True, + ) + selector.fit(df, y=df["target"]) + out = selector.transform(df) + + assert selector.selected_features_ == ["f1"] + assert "f1" in out.columns + assert "f2" not in out.columns + assert "target" in out.columns + + +def test_feature_selector_variance_works_without_target(): + idx = pd.date_range("2022-01-01", periods=40, freq="D") + df = pd.DataFrame( + { + "f1": np.linspace(0.0, 1.0, len(idx)), + "f2": 1.0, + "target": np.linspace(0.0, 1.0, len(idx)), + }, + index=idx, + ) + selector = FeatureSelector( + method="variance", + feature_cols=["f1", "f2"], + n_features=1, + drop_unselected=True, + ) + selector.fit(df) + out = selector.transform(df) + + assert selector.selected_features_ == ["f1"] + assert "f2" not in out.columns + + +def test_pipeline_feature_selector_with_auto_sklearn_features(): + df = _make_df() + pipeline = Pipeline( + steps=[ + ( + "selector", + FeatureSelector( + method="spearman", + feature_cols=["f1", "f2"], + target_col="target", + n_features=1, + drop_unselected=True, + ), + ), + ( + "learner", + SKLearnWrapper( + model=LinearRegression(), + feature_cols=None, + target_col="target", + output_col="forecast", + ), + ), + ] + ) + out = pipeline.fit_transform(df) + + assert "forecast" in out.columns + assert out["forecast"].notna().sum() > 0 + + +def test_feature_selector_supports_legacy_alias_names(): + df = _make_df() + selector = FeatureSelector( + 
method="spearman_rank", + feature_cols=["f1", "f2"], + target_col="target", + n_features=1, + ) + selector.fit(df) + assert selector.method == "spearman" + assert selector.selected_features_ == ["f1"] + + +def test_feature_selector_categorical_association_methods(): + df = _make_df() + for method in ("cramer_v", "tschuprow", "pearson_cc", "chi2"): + selector = FeatureSelector( + method=method, + feature_cols=["f1", "f2"], + target_col="target", + n_features=1, + feature_bins=4, + target_bins=3, + ) + selector.fit(df) + assert len(selector.selected_features_) == 1 + assert selector.selected_features_[0] in {"f1", "f2"} + + +def test_feature_selector_model_methods(): + df = _make_df() + methods = [ + ("lars", {}), + ("lasso", {"alpha": 0.01}), + ("ridge", {"alpha": 1.0}), + ("elastic_net", {"alpha": 0.01, "l1_ratio": 0.5}), + ("random_forest", {"n_estimators": 20, "max_depth": 3}), + ("rfe", {}), + ("forward", {}), + ("backward", {}), + ("stepwise", {}), + ("exhaustive", {}), + ] + for method, kwargs in methods: + selector = FeatureSelector( + method=method, + feature_cols=["f1", "f2"], + target_col="target", + n_features=1, + method_kwargs=kwargs, + ) + selector.fit(df) + assert selector.selected_features_ == ["f1"] + + +def test_feature_selector_xgboost_optional_dependency(): + df = _make_df() + selector = FeatureSelector( + method="xgboost", + feature_cols=["f1", "f2"], + target_col="target", + n_features=1, + method_kwargs={"n_estimators": 20, "max_depth": 3}, + ) + try: + selector.fit(df) + except ImportError: + pytest.skip("xgboost is not installed in test environment.") + assert selector.selected_features_ == ["f1"] + + +def test_feature_selector_catboost_optional_dependency(): + df = _make_df() + selector = FeatureSelector( + method="catboost", + feature_cols=["f1", "f2"], + target_col="target", + n_features=1, + method_kwargs={"iterations": 25, "depth": 3, "learning_rate": 0.2}, + ) + try: + selector.fit(df) + except ImportError: + pytest.skip("catboost is 
not installed in test environment.") + assert selector.selected_features_ == ["f1"] + + +def test_feature_selector_redundancy_methods(): + df = _make_df() + for method in ("mrmr", "mifs", "mrmr_mifs", "spearman_mrmr"): + selector = FeatureSelector( + method=method, + feature_cols=["f1", "f2"], + target_col="target", + n_features=1, + ) + selector.fit(df) + assert selector.selected_features_ == ["f1"] diff --git a/tests/learning/test_forward_target_spec.py b/tests/learning/test_forward_target_spec.py new file mode 100644 index 0000000..a4b13c7 --- /dev/null +++ b/tests/learning/test_forward_target_spec.py @@ -0,0 +1,56 @@ +import numpy as np +import pandas as pd + +from factorlab.targets import ForwardDirectionTarget, ForwardTargetSpec + + +def _make_panel(n_days: int = 15, tickers: tuple[str, ...] = ("BTC", "ETH"), seed: int = 9) -> pd.DataFrame: + rng = np.random.default_rng(seed) + dates = pd.date_range("2022-01-01", periods=n_days, freq="D") + idx = pd.MultiIndex.from_product([dates, tickers], names=["date", "ticker"]) + close = 100.0 + np.cumsum(rng.normal(0.0, 1.0, size=len(idx))) + return pd.DataFrame({"close": close}, index=idx).sort_index() + + +def test_forward_target_spec_trainable_mask_excludes_labels_past_train_end(): + X = _make_panel() + spec = ForwardTargetSpec(input_col="close", output_col="target", horizon=2, group_level=1) + + unique_dates = X.index.get_level_values("date").unique().sort_values() + train_dates = unique_dates[:10] + train_index = X.loc[(slice(train_dates.min(), train_dates.max()), slice(None)), :].index + + mask = spec.trainable_mask(index=X.index, train_index=train_index, date_level=0) + label_end = spec.label_end_dates(index=X.index, date_level=0) + train_end = pd.Timestamp(train_dates.max()) + + assert mask.dtype == bool + assert int(mask.sum()) > 0 + assert (pd.DatetimeIndex(label_end[mask].values) <= train_end).all() + + leaky_rows = ( + pd.Index(X.index).isin(train_index) + & label_end.notna().to_numpy() + & 
(pd.DatetimeIndex(label_end.values) > train_end) + ) + assert not mask[leaky_rows].any() + + +def test_forward_direction_target_preserves_nan_tail_labels(): + X = _make_panel(n_days=8, tickers=("BTC",)) + target = ForwardDirectionTarget( + input_col="close", + output_col="direction", + horizon=2, + threshold=0.0, + group_level=1, + ) + target.fit(X) + out = target.transform(X)["direction"] + + unique_dates = out.index.get_level_values("date").unique().sort_values() + tail_dates = unique_dates[-2:] + tail_mask = out.index.get_level_values("date").isin(tail_dates) + + assert out[tail_mask].isna().all() + assert out[~tail_mask].notna().any() diff --git a/tests/learning/test_import_paths.py b/tests/learning/test_import_paths.py new file mode 100644 index 0000000..57cce87 --- /dev/null +++ b/tests/learning/test_import_paths.py @@ -0,0 +1,29 @@ +import subprocess +import sys + + +def test_targets_import_in_fresh_interpreter(): + code = "import factorlab.targets as targets; import factorlab.targets.forward as forward" + proc = subprocess.run( + [sys.executable, "-c", code], + capture_output=True, + text=True, + check=False, + ) + assert proc.returncode == 0, proc.stderr + + +def test_learning_search_exports_available_via_package_import(): + code = ( + "from factorlab.learning import WalkForwardGridSearch, parameter_grid, set_pipeline_params; " + "assert WalkForwardGridSearch is not None; " + "assert callable(parameter_grid); " + "assert callable(set_pipeline_params)" + ) + proc = subprocess.run( + [sys.executable, "-c", code], + capture_output=True, + text=True, + check=False, + ) + assert proc.returncode == 0, proc.stderr diff --git a/tests/learning/test_learners.py b/tests/learning/test_learners.py new file mode 100644 index 0000000..ac8402e --- /dev/null +++ b/tests/learning/test_learners.py @@ -0,0 +1,258 @@ +import numpy as np +import pandas as pd +import pytest + +from factorlab.learning import ( + CatBoostClassifierLearner, + CatBoostRegressorLearner, + 
ClassificationLearner, + LinearRegressionLearner, + LogisticRegressionCVLearner, + RandomForestRegressorLearner, + RegressionLearner, + SupervisedPCALearner, + TorchRegressorLearner, + XGBoostClassifierLearner, + XGBoostRegressorLearner, +) + + +def _make_regression_df(n: int = 160, seed: int = 21) -> pd.DataFrame: + rng = np.random.default_rng(seed) + idx = pd.date_range("2020-01-01", periods=n, freq="D") + x1 = rng.normal(size=n) + x2 = rng.normal(size=n) + y = 0.7 * x1 - 0.2 * x2 + rng.normal(scale=0.05, size=n) + return pd.DataFrame({"x1": x1, "x2": x2, "target": y}, index=idx) + + +def _make_classification_df(n: int = 180, seed: int = 44) -> pd.DataFrame: + rng = np.random.default_rng(seed) + idx = pd.date_range("2020-01-01", periods=n, freq="D") + x1 = rng.normal(size=n) + x2 = rng.normal(size=n) + latent = 0.9 * x1 - 0.5 * x2 + rng.normal(scale=0.2, size=n) + y = (latent > 0).astype(int) + return pd.DataFrame({"x1": x1, "x2": x2, "target": y}, index=idx) + + +@pytest.mark.parametrize( + "method,kwargs,n_features", + [ + ("linear_regression", {}, None), + ("ridge", {"alpha": 1.0}, None), + ("lasso", {"alpha": 0.01}, None), + ("elastic_net", {"alpha": 0.01, "l1_ratio": 0.5}, None), + ("lars", {}, 1), + ("ridge_cv", {}, None), + ("random_forest", {"n_estimators": 20, "max_depth": 4}, None), + ], +) +def test_regression_learner_methods_fit_transform(method: str, kwargs: dict, n_features: int | None): + df = _make_regression_df() + learner = RegressionLearner( + method=method, + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + n_features=n_features, + **kwargs, + ) + learner.fit(df) + out = learner.transform(df) + + assert "forecast" in out.columns + assert out["forecast"].notna().sum() == len(df) + + +@pytest.mark.parametrize( + "method,kwargs,prediction_method", + [ + ("logistic_regression", {}, "predict_proba"), + ("logistic_regression_cv", {"cv": 3}, "predict_proba"), + ("random_forest", {"n_estimators": 20, "max_depth": 4}, 
"predict_proba"), + ("random_forest", {"n_estimators": 20, "max_depth": 4}, "predict"), + ], +) +def test_classification_learner_methods_fit_transform( + method: str, + kwargs: dict, + prediction_method: str, +): + df = _make_classification_df() + learner = ClassificationLearner( + method=method, + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + prediction_method=prediction_method, + **kwargs, + ) + learner.fit(df) + out = learner.transform(df) + + assert "forecast" in out.columns + valid = out["forecast"].dropna() + assert len(valid) == len(df) + if prediction_method == "predict_proba": + assert ((valid >= 0.0) & (valid <= 1.0)).all() + else: + assert set(valid.unique()).issubset({0.0, 1.0}) + + +def test_linear_regression_learner_fit_transform(): + df = _make_regression_df() + learner = LinearRegressionLearner( + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + ) + learner.fit(df) + out = learner.transform(df) + + assert "forecast" in out.columns + assert out["forecast"].notna().sum() == len(df) + + +def test_random_forest_learner_fit_transform(): + df = _make_regression_df() + learner = RandomForestRegressorLearner( + n_estimators=20, + max_depth=4, + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + ) + learner.fit(df) + out = learner.transform(df) + + assert "forecast" in out.columns + assert out["forecast"].notna().sum() == len(df) + + +def test_logistic_regression_cv_learner_fit_transform(): + df = _make_classification_df() + learner = LogisticRegressionCVLearner( + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + cv=3, + ) + learner.fit(df) + out = learner.transform(df) + + assert "forecast" in out.columns + valid = out["forecast"].dropna() + assert len(valid) == len(df) + assert ((valid >= 0.0) & (valid <= 1.0)).all() + + +def test_torch_learner_optional_dependency(): + df = _make_regression_df() + learner = TorchRegressorLearner( + 
feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + epochs=2, + ) + try: + learner.fit(df) + except ImportError: + pytest.skip("torch is not installed in test environment.") + out = learner.transform(df) + assert "forecast" in out.columns + + +def test_supervised_pca_learner_fit_transform(): + df = _make_regression_df(n=200, seed=42) + df["x3"] = 0.5 * df["x1"] + 0.5 * df["x2"] + np.random.default_rng(1).normal(scale=0.01, size=len(df)) + + learner = SupervisedPCALearner( + feature_cols=["x1", "x2", "x3"], + target_col="target", + output_col="forecast", + selection_method="lasso", + n_features=2, + n_components=1, + ) + learner.fit(df) + out = learner.transform(df) + + assert "forecast" in out.columns + assert out["forecast"].notna().sum() == len(df) + + +def test_xgboost_regressor_learner_optional_dependency(): + df = _make_regression_df() + try: + learner = XGBoostRegressorLearner( + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + n_estimators=20, + max_depth=3, + ) + except ImportError: + pytest.skip("xgboost is not installed in test environment.") + + learner.fit(df) + out = learner.transform(df) + assert out["forecast"].notna().sum() == len(df) + + +def test_xgboost_classifier_learner_optional_dependency(): + df = _make_classification_df() + try: + learner = XGBoostClassifierLearner( + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + n_estimators=20, + max_depth=3, + ) + except ImportError: + pytest.skip("xgboost is not installed in test environment.") + + learner.fit(df) + out = learner.transform(df) + valid = out["forecast"].dropna() + assert len(valid) == len(df) + assert ((valid >= 0.0) & (valid <= 1.0)).all() + + +def test_catboost_regressor_learner_optional_dependency(): + df = _make_regression_df() + try: + learner = CatBoostRegressorLearner( + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + iterations=30, + depth=3, + ) + except ImportError: + 
pytest.skip("catboost is not installed in test environment.") + + learner.fit(df) + out = learner.transform(df) + assert out["forecast"].notna().sum() == len(df) + + +def test_catboost_classifier_learner_optional_dependency(): + df = _make_classification_df() + try: + learner = CatBoostClassifierLearner( + feature_cols=["x1", "x2"], + target_col="target", + output_col="forecast", + iterations=30, + depth=3, + ) + except ImportError: + pytest.skip("catboost is not installed in test environment.") + + learner.fit(df) + out = learner.transform(df) + valid = out["forecast"].dropna() + assert len(valid) == len(df) + assert ((valid >= 0.0) & (valid <= 1.0)).all() diff --git a/tests/learning/test_search.py b/tests/learning/test_search.py new file mode 100644 index 0000000..7a6d07a --- /dev/null +++ b/tests/learning/test_search.py @@ -0,0 +1,60 @@ +import numpy as np +import pandas as pd + +from factorlab.core.pipeline import Pipeline +from factorlab.learning import ( + ExpandingKFoldPanelSplit, + LinearRegressionLearner, + WalkForwardGridSearch, +) + + +def _make_panel(seed: int = 7) -> tuple[pd.DataFrame, pd.Series]: + rng = np.random.default_rng(seed) + dates = pd.date_range("2021-01-01", periods=80, freq="D") + tickers = ["BTC", "ETH", "SOL"] + idx = pd.MultiIndex.from_product([dates, tickers], names=["date", "ticker"]) + + x1 = rng.normal(size=len(idx)) + x2 = rng.normal(size=len(idx)) + noise = rng.normal(scale=0.05, size=len(idx)) + y = 0.25 * x1 + 1.1 * x2 + noise + + X = pd.DataFrame({"x1": x1, "x2": x2}, index=idx) + y_series = pd.Series(y, index=idx, name="target") + return X, y_series + + +def test_walk_forward_grid_search_runs_and_ranks(): + X, y = _make_panel() + + pipeline = Pipeline( + [ + ( + "lr", + LinearRegressionLearner( + feature_cols=["x1", "x2"], + output_col="forecast", + ), + ) + ] + ) + splitter = ExpandingKFoldPanelSplit(n_splits=3, lookahead=0, embargo=0) + + def scorer(out: pd.DataFrame) -> float: + joined = pd.concat([out["forecast"], y], 
axis=1).dropna() + mse = ((joined.iloc[:, 0] - joined.iloc[:, 1]) ** 2).mean() + return -float(mse) + + search = WalkForwardGridSearch( + pipeline=pipeline, + splitter=splitter, + y=y, + param_grid={"lr__exclude_cols": [[], ["x2"]]}, + scorer=scorer, + ) + res = search.run(X) + + assert not res.empty + assert {"score", "n_folds", "lr__exclude_cols"}.issubset(res.columns) + assert res.iloc[0]["score"] >= res.iloc[-1]["score"] diff --git a/tests/learning/test_splitters.py b/tests/learning/test_splitters.py new file mode 100644 index 0000000..5f935c9 --- /dev/null +++ b/tests/learning/test_splitters.py @@ -0,0 +1,112 @@ +import numpy as np +import pandas as pd +import pytest + +from factorlab.learning import ( + ExpandingFrequencyPanelSplit, + ExpandingIncrementPanelSplit, + ExpandingKFoldPanelSplit, +) + + +def _make_xy(n_days: int = 180, n_assets: int = 4, seed: int = 123): + rng = np.random.default_rng(seed) + dates = pd.date_range("2020-01-01", periods=n_days, freq="D") + tickers = [f"A{i}" for i in range(n_assets)] + idx = pd.MultiIndex.from_product([dates, tickers], names=["date", "ticker"]) + + X = pd.DataFrame( + { + "f1": rng.normal(size=len(idx)), + "f2": rng.normal(size=len(idx)), + }, + index=idx, + ) + y = pd.Series(rng.normal(size=len(idx)), index=idx, name="y") + return X, y + + +def test_expanding_kfold_time_order_and_purge(): + X, y = _make_xy() + splitter = ExpandingKFoldPanelSplit(n_splits=4, lookahead=1, embargo=1) + splits = list(splitter.split(X, y)) + + assert len(splits) > 0 + for train_idx, test_idx in splits: + train_dates = X.iloc[train_idx].index.get_level_values("date") + test_dates = X.iloc[test_idx].index.get_level_values("date") + assert train_dates.max() < test_dates.min() + + +def test_expanding_increment_respects_lookahead_embargo_boundary(): + X, y = _make_xy() + splitter = ExpandingIncrementPanelSplit( + train_intervals=10, + test_size=5, + min_train_periods=50, + lookahead=2, + embargo=3, + ) + splits = list(splitter.split(X, y)) 
+ + assert len(splits) > 0 + for train_idx, test_idx in splits: + train_dates = pd.DatetimeIndex(X.iloc[train_idx].index.get_level_values("date")) + test_dates = pd.DatetimeIndex(X.iloc[test_idx].index.get_level_values("date")) + gap_days = (test_dates.min() - train_dates.max()).days + assert gap_days >= 5 + + +def test_expanding_frequency_produces_non_empty_test_folds(): + X, y = _make_xy(n_days=365) + splitter = ExpandingFrequencyPanelSplit( + expansion_freq="M", + test_freq="M", + min_train_periods=60, + lookahead=1, + embargo=1, + ) + splits = list(splitter.split(X, y)) + assert len(splits) > 0 + assert all(len(test_idx) > 0 for _, test_idx in splits) + + +def test_splitter_rejects_unsorted_index(): + X, y = _make_xy() + shuffled = np.random.default_rng(1).permutation(len(X)) + X_unsorted = X.iloc[shuffled] + y_unsorted = y.iloc[shuffled] + + splitter = ExpandingIncrementPanelSplit( + train_intervals=5, + test_size=3, + min_train_periods=30, + lookahead=1, + embargo=0, + ) + with pytest.raises(ValueError, match="index must be sorted"): + _ = list(splitter.split(X_unsorted, y_unsorted)) + + +def test_splitter_indices_align_to_original_rows_after_nan_drop(): + X, y = _make_xy(n_days=140, n_assets=3) + y = y.copy() + drop_dates = X.index.get_level_values("date").unique()[:8] + drop_mask = X.index.get_level_values("date").isin(drop_dates) + y.loc[drop_mask] = np.nan + + splitter = ExpandingIncrementPanelSplit( + train_intervals=10, + test_size=5, + min_train_periods=40, + lookahead=1, + embargo=0, + drop_nas=True, + ) + + valid_rows = y.notna().to_numpy() + splits = list(splitter.split(X, y)) + assert len(splits) > 0 + for train_idx, test_idx in splits: + selected = np.concatenate([train_idx, test_idx]) + assert valid_rows[selected].all() diff --git a/tests/learning/test_temporal_leakage_guards.py b/tests/learning/test_temporal_leakage_guards.py new file mode 100644 index 0000000..8d3c485 --- /dev/null +++ b/tests/learning/test_temporal_leakage_guards.py @@ -0,0 
+1,47 @@ +import pandas as pd +import numpy as np + +from factorlab.learning import ExpandingIncrementPanelSplit + + +def _make_xy(n_days: int = 200, n_assets: int = 3, seed: int = 42): + rng = np.random.default_rng(seed) + dates = pd.date_range("2020-01-01", periods=n_days, freq="D") + tickers = [f"A{i}" for i in range(n_assets)] + idx = pd.MultiIndex.from_product([dates, tickers], names=["date", "ticker"]) + X = pd.DataFrame({"f1": rng.normal(size=len(idx))}, index=idx) + y = pd.Series(rng.normal(size=len(idx)), index=idx, name="y") + return X, y + + +def test_fold_boundary_respects_lookahead_and_embargo(): + X, y = _make_xy() + lookahead = 2 + embargo = 3 + splitter = ExpandingIncrementPanelSplit( + train_intervals=5, + test_size=3, + min_train_periods=60, + lookahead=lookahead, + embargo=embargo, + ) + for train_idx, test_idx in splitter.split(X, y): + train_dates = pd.DatetimeIndex(X.iloc[train_idx].index.get_level_values("date")) + test_dates = pd.DatetimeIndex(X.iloc[test_idx].index.get_level_values("date")) + assert train_dates.max() < test_dates.min() + assert (test_dates.min() - train_dates.max()).days >= lookahead + embargo + + +def test_horizon_purge_no_train_leakage_into_test_start(): + X, y = _make_xy() + splitter = ExpandingIncrementPanelSplit( + train_intervals=1, + test_size=1, + min_train_periods=40, + lookahead=1, + embargo=0, + ) + for train_idx, test_idx in splitter.split(X, y): + train_dates = pd.DatetimeIndex(X.iloc[train_idx].index.get_level_values("date")) + test_dates = pd.DatetimeIndex(X.iloc[test_idx].index.get_level_values("date")) + assert train_dates.max() <= test_dates.min() - pd.Timedelta(days=1) diff --git a/tests/learning/test_time_series.py b/tests/learning/test_time_series.py new file mode 100644 index 0000000..aa626e6 --- /dev/null +++ b/tests/learning/test_time_series.py @@ -0,0 +1,100 @@ +import numpy as np +import pandas as pd + +from factorlab.learning import ( + LagFeatures, + StatsmodelsOLSLearner, + TimeSeriesAnalysis, + 
TimeSeriesDiagnostics,
+    add_lags,
+    expanding_window,
+    rolling_window,
+)
+
+
+def _make_ts_df(n: int = 180, seed: int = 17) -> pd.DataFrame:
+    rng = np.random.default_rng(seed)
+    idx = pd.date_range("2020-01-01", periods=n, freq="D")
+    x1 = rng.normal(size=n)
+    x2 = rng.normal(size=n)
+    target = 0.6 * x1 - 0.3 * x2 + rng.normal(scale=0.1, size=n)
+    return pd.DataFrame({"x1": x1, "x2": x2, "target": target}, index=idx)
+
+
+def test_add_lags_single_index():
+    df = _make_ts_df(n=20)
+    out = add_lags(df[["x1"]], n_lags=2, include_original=True)
+    assert "x1" in out.columns
+    assert "x1_L1" in out.columns
+    assert "x1_L2" in out.columns
+
+
+def test_lag_features_transform():
+    df = _make_ts_df(n=20)
+    tr = LagFeatures(input_cols=["x1", "x2"], n_lags=2)
+    tr.fit(df)
+    out = tr.transform(df)
+    assert "x1_L1" in out.columns
+    assert "x2_L2" in out.columns
+
+
+def test_statsmodels_ols_learner_fit_transform():
+    df = _make_ts_df()
+    learner = StatsmodelsOLSLearner(
+        feature_cols=["x1", "x2"],
+        target_col="target",
+        output_col="forecast",
+    )
+    learner.fit(df)
+    out = learner.transform(df)
+    assert "forecast" in out.columns
+    assert out["forecast"].notna().sum() == len(df)
+    assert learner.model_result is not None
+
+
+def test_time_series_diagnostics_outputs():
+    df = _make_ts_df()
+    adf = TimeSeriesDiagnostics.adf_test(df[["x1", "x2"]])
+    assert {"adf_stat", "p_value", "used_lag", "n_obs"}.issubset(adf.columns)
+    assert "x1" in adf.index
+
+    gc = TimeSeriesDiagnostics.granger_causality(
+        target=df["target"],
+        features=df[["x1", "x2"]],
+        max_lag=3,
+        verbose=False,
+    )
+    assert {"min_p_value", "best_lag"}.issubset(gc.columns)
+    assert "x1" in gc.index
+
+
+def test_time_series_analysis_linear_regression_params_and_resid():
+    df = _make_ts_df(n=120)
+    tsa = TimeSeriesAnalysis(
+        target=df["target"],
+        features=df[["x1", "x2"]],
+        trend="c",
+        window_type="rolling",
+        window_size=30,
+    )
+    params = tsa.linear_regression(output="params")
+    resid = tsa.linear_regression(output="resid")
+
+    assert {"const", "x1", "x2"}.issubset(params.columns)
+    assert "resid" in resid.columns
+    assert resid["resid"].notna().sum() > 0
+
+
+def test_rolling_and_expanding_window_helpers():
+    s = pd.Series(np.arange(10, dtype=float), index=pd.date_range("2021-01-01", periods=10, freq="D"))
+
+    def win_mean(x):
+        return pd.Series({"mean": float(x.mean())})
+
+    roll = rolling_window(win_mean, s, window_size=4)
+    exp = expanding_window(win_mean, s, min_obs=4)
+
+    assert isinstance(roll, pd.DataFrame)
+    assert isinstance(exp, pd.DataFrame)
+    assert roll.shape[0] == 7
+    assert exp.shape[0] == 7
diff --git a/tests/learning/test_unsupervised_transforms.py b/tests/learning/test_unsupervised_transforms.py
new file mode 100644
index 0000000..157ecb5
--- /dev/null
+++ b/tests/learning/test_unsupervised_transforms.py
@@ -0,0 +1,62 @@
+import numpy as np
+import pandas as pd
+
+from factorlab.learning import PCATransform, PPCATransform, R2PCATransform
+
+
+def _make_df(n: int = 120, seed: int = 11) -> pd.DataFrame:
+    rng = np.random.default_rng(seed)
+    idx = pd.date_range("2020-01-01", periods=n, freq="D")
+    x1 = rng.normal(size=n)
+    x2 = 0.5 * x1 + rng.normal(scale=0.5, size=n)
+    x3 = -0.25 * x1 + 0.75 * x2 + rng.normal(scale=0.5, size=n)
+    df = pd.DataFrame({"x1": x1, "x2": x2, "x3": x3}, index=idx)
+    df.loc[idx[5], "x2"] = np.nan
+    df.loc[idx[11], "x3"] = np.nan
+    return df
+
+
+def test_pca_transform_appends_components():
+    df = _make_df()
+    tr = PCATransform(input_cols=["x1", "x2", "x3"], n_components=2, output_prefix="PC")
+    tr.fit(df)
+    out = tr.transform(df)
+
+    assert {"PC1", "PC2"}.issubset(out.columns)
+    assert out[["PC1", "PC2"]].notna().sum().min() > 0
+    assert tr.eigenvectors_ is not None
+    assert tr.explained_variance_ratio_ is not None
+
+
+def test_ppca_transform_append_and_shape():
+    df = _make_df(n=150, seed=17)
+    tr = PPCATransform(
+        input_cols=["x1", "x2", "x3"],
+        n_components=2,
+        min_obs=20,
+        min_feat=2,
+        output_prefix="PPCA",
+    )
+    tr.fit(df)
+    out = tr.transform(df)
+
+    assert {"PPCA1", "PPCA2"}.issubset(out.columns)
+    assert out[["PPCA1", "PPCA2"]].shape[0] == len(df)
+    assert tr.eigenvectors_ is not None
+    assert tr.explained_variance_ratio_ is not None
+
+
+def test_r2pca_transform_can_return_components_only():
+    df = _make_df(n=90, seed=23).fillna(0.0)
+    tr = R2PCATransform(
+        input_cols=["x1", "x2", "x3"],
+        n_components=2,
+        output_prefix="R2PC",
+        append=False,
+        random_state=0,
+    )
+    tr.fit(df)
+    pcs = tr.transform(df)
+
+    assert list(pcs.columns) == ["R2PC1", "R2PC2"]
+    assert pcs.shape == (len(df), 2)
diff --git a/tests/learning/test_walk_forward_learner.py b/tests/learning/test_walk_forward_learner.py
new file mode 100644
index 0000000..acf8224
--- /dev/null
+++ b/tests/learning/test_walk_forward_learner.py
@@ -0,0 +1,117 @@
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+
+from factorlab.core.pipeline import Pipeline
+from factorlab.factors.library.trend.trend import Trend
+from factorlab.learning import WalkForwardLearner
+from factorlab.signals.generator import SignalGenerator
+from factorlab.targets import ForwardReturnTarget
+
+
+def _make_panel(
+    n_days: int = 220,
+    tickers: tuple[str, ...] = ("BTC", "ETH", "SOL", "XRP"),
+    seed: int = 7,
+) -> pd.DataFrame:
+    rng = np.random.default_rng(seed)
+    dates = pd.date_range("2021-01-01", periods=n_days, freq="D")
+    index = pd.MultiIndex.from_product([dates, tickers], names=["date", "ticker"])
+    df = pd.DataFrame(index=index, columns=["close"], dtype=float)
+
+    for i, ticker in enumerate(tickers):
+        ticker_mask = df.index.get_level_values("ticker") == ticker
+        drift = 0.0003 + (i * 0.0001)
+        ret = rng.normal(loc=drift, scale=0.02, size=n_days)
+        close = 100.0 * np.exp(np.cumsum(ret))
+        df.loc[ticker_mask, "close"] = close
+
+    return df.sort_index()
+
+
+def _make_pipeline(include_signal: bool = True) -> Pipeline:
+    steps = [
+        (
+            "trend_factor",
+            Trend(
+                method="price_momentum",
+                input_col="close",
+                window_size=5,
+                scale=False,
+            ),
+        ),
+        (
+            "forecaster",
+            WalkForwardLearner(
+                model=LinearRegression(),
+                feature_cols=["PriceMomentum_5"],
+                target_transform=ForwardReturnTarget(
+                    input_col="close",
+                    output_col="fwd_ret",
+                    horizon=1,
+                ),
+                prediction_col="forecast",
+                window_type="expanding",
+                min_train_periods=40,
+                retrain_interval=1,
+                min_train_samples=100,
+            ),
+        ),
+    ]
+
+    if include_signal:
+        steps.append(
+            (
+                "signal",
+                SignalGenerator(
+                    signal_type="sign",
+                    input_col="forecast",
+                    output_col="signal",
+                ),
+            )
+        )
+
+    return Pipeline(steps=steps)
+
+
+def test_pipeline_factor_to_forecast_to_signal_chain():
+    df = _make_panel()
+    pipeline = _make_pipeline(include_signal=True)
+
+    out = pipeline.fit_transform(df)
+
+    assert "PriceMomentum_5" in out.columns
+    assert "forecast" in out.columns
+    assert "signal" in out.columns
+
+    assert out["forecast"].notna().sum() > 0
+    signal_values = out["signal"].dropna().unique().tolist()
+    assert set(signal_values).issubset({-1.0, 0.0, 1.0})
+
+
+def test_walk_forward_forecast_is_invariant_to_future_data_changes():
+    df = _make_panel()
+    pipe_1 = _make_pipeline(include_signal=False)
+    out_1 = pipe_1.fit_transform(df)
+
+    shock_date = df.index.get_level_values("date").unique()[160]
+    df_shocked = df.copy(deep=True)
+    future_mask = df_shocked.index.get_level_values("date") >= shock_date
+    df_shocked.loc[future_mask, "close"] = df_shocked.loc[future_mask, "close"] * 4.0
+
+    pipe_2 = _make_pipeline(include_signal=False)
+    out_2 = pipe_2.fit_transform(df_shocked)
+
+    pre_shock_mask = out_1.index.get_level_values("date") < shock_date
+    s1 = out_1.loc[pre_shock_mask, "forecast"]
+    s2 = out_2.loc[pre_shock_mask, "forecast"]
+
+    assert s1.notna().sum() > 0
+    pd.testing.assert_series_equal(
+        s1,
+        s2,
+        check_names=False,
+        check_exact=False,
+        rtol=1e-12,
+        atol=1e-12,
+    )
diff --git a/tests/learning/test_walk_forward_runner.py b/tests/learning/test_walk_forward_runner.py
new file mode 100644
index 0000000..3b5cc40
--- /dev/null
+++ b/tests/learning/test_walk_forward_runner.py
@@ -0,0 +1,165 @@
+import pandas as pd
+import numpy as np
+import pytest
+from sklearn.linear_model import LinearRegression
+
+from factorlab.core.pipeline import Pipeline
+from factorlab.core.walk_forward_runner import WalkForwardRunner
+from factorlab.features.transforms.returns import Returns
+from factorlab.learning import ExpandingIncrementPanelSplit
+from factorlab.learning.sklearn_wrapper import SKLearnWrapper
+from factorlab.signals.generator import SignalGenerator
+from factorlab.targets import ForwardReturnTarget, ForwardTargetSpec
+
+
+def _make_panel(n_days: int = 220, tickers: tuple[str, ...] = ("BTC", "ETH", "SOL"), seed: int = 5) -> pd.DataFrame:
+    rng = np.random.default_rng(seed)
+    dates = pd.date_range("2021-01-01", periods=n_days, freq="D")
+    idx = pd.MultiIndex.from_product([dates, tickers], names=["date", "ticker"])
+    df = pd.DataFrame(index=idx, columns=["close"], dtype=float)
+    for i, t in enumerate(tickers):
+        m = df.index.get_level_values("ticker") == t
+        ret = rng.normal(loc=0.0002 + i * 0.00005, scale=0.02, size=n_days)
+        df.loc[m, "close"] = 100.0 * np.exp(np.cumsum(ret))
+    return df.sort_index()
+
+
+def _make_pipeline() -> Pipeline:
+    return Pipeline(
+        steps=[
+            ("ret", Returns(method="pct", input_col="close", output_col="ret", lags=1)),
+            ("target", ForwardReturnTarget(input_col="close", output_col="target", horizon=1)),
+            (
+                "learner",
+                SKLearnWrapper(
+                    model=LinearRegression(),
+                    feature_cols=["ret"],
+                    target_col="target",
+                    output_col="forecast",
+                    prediction_method="predict",
+                ),
+            ),
+            ("signal", SignalGenerator(signal_type="sign", input_col="forecast", output_col="signal")),
+        ]
+    )
+
+
+def _make_pipeline_external_target() -> Pipeline:
+    return Pipeline(
+        steps=[
+            ("ret", Returns(method="pct", input_col="close", output_col="ret", lags=1)),
+            (
+                "learner",
+                SKLearnWrapper(
+                    model=LinearRegression(),
+                    feature_cols=["ret"],
+                    target_col=None,
+                    output_col="forecast",
+                    prediction_method="predict",
+                ),
+            ),
+            ("signal", SignalGenerator(signal_type="sign", input_col="forecast", output_col="signal")),
+        ]
+    )
+
+
+def test_walk_forward_runner_pipeline_chain():
+    X = _make_panel()
+    splitter = ExpandingIncrementPanelSplit(
+        train_intervals=1,
+        test_size=1,
+        min_train_periods=45,
+        lookahead=1,
+        embargo=0,
+    )
+    runner = WalkForwardRunner(pipeline=_make_pipeline(), splitter=splitter, date_level=0)
+    out = runner.run(X)
+
+    assert "forecast" in out.columns
+    assert "signal" in out.columns
+    assert out["forecast"].notna().sum() > 0
+    assert len(runner.fold_info) > 0
+
+
+def test_walk_forward_runner_future_perturbation_invariance():
+    X = _make_panel()
+    splitter = ExpandingIncrementPanelSplit(
+        train_intervals=1,
+        test_size=1,
+        min_train_periods=50,
+        lookahead=1,
+        embargo=0,
+    )
+
+    runner_1 = WalkForwardRunner(pipeline=_make_pipeline(), splitter=splitter, date_level=0)
+    out_1 = runner_1.run(X)
+
+    shock_date = X.index.get_level_values("date").unique()[170]
+    X_shock = X.copy(deep=True)
+    m = X_shock.index.get_level_values("date") >= shock_date
+    X_shock.loc[m, "close"] = X_shock.loc[m, "close"] * 5.0
+
+    runner_2 = WalkForwardRunner(pipeline=_make_pipeline(), splitter=splitter, date_level=0)
+    out_2 = runner_2.run(X_shock)
+
+    pre = out_1.index.get_level_values("date") < shock_date
+    pd.testing.assert_series_equal(
+        out_1.loc[pre, "forecast"],
+        out_2.loc[pre, "forecast"],
+        check_names=False,
+        check_exact=False,
+        rtol=1e-12,
+        atol=1e-12,
+    )
+
+
+def test_walk_forward_runner_with_fold_local_target_spec():
+    X = _make_panel()
+    splitter = ExpandingIncrementPanelSplit(
+        train_intervals=1,
+        test_size=1,
+        min_train_periods=50,
+        lookahead=1,
+        embargo=0,
+    )
+    target_spec = ForwardTargetSpec(
+        input_col="close",
+        output_col="target",
+        horizon=1,
+        method="pct",
+        group_level=1,
+    )
+
+    runner = WalkForwardRunner(
+        pipeline=_make_pipeline_external_target(),
+        splitter=splitter,
+        target_spec=target_spec,
+        date_level=0,
+    )
+    out = runner.run(X)
+
+    assert "forecast" in out.columns
+    assert out["forecast"].notna().sum() > 0
+    assert len(runner.fold_info) > 0
+    assert all((fi.n_trainable is None) or (fi.n_trainable > 0) for fi in runner.fold_info)
+
+
+def test_walk_forward_runner_raises_when_lookahead_shorter_than_horizon():
+    X = _make_panel()
+    splitter = ExpandingIncrementPanelSplit(
+        train_intervals=1,
+        test_size=1,
+        min_train_periods=50,
+        lookahead=0,
+        embargo=0,
+    )
+    target_spec = ForwardTargetSpec(input_col="close", output_col="target", horizon=1)
+    runner = WalkForwardRunner(
+        pipeline=_make_pipeline_external_target(),
+        splitter=splitter,
+        target_spec=target_spec,
+        date_level=0,
+    )
+
+    with pytest.raises(ValueError, match="lookahead is shorter than target_spec.horizon"):
+        runner.run(X)