diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..136c3028 --- /dev/null +++ b/.gitignore @@ -0,0 +1,225 @@ +#self files +.pdf + + + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +*.lcov +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi/* +!.pixi/config.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule* +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. 
However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ +# Temporary file for partial code execution +tempCodeRunnerFile.py + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml \ No newline at end of file diff --git a/README.md b/README.md index d68e7c4e..bf6aa1b6 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,23 @@ # Hedge Fund Risk Modeling & Semi-Automated Trading System ## Team Information -- **Team Name**: [Team Name] -- **Year**: [Year] -- **All-Female Team**: [Yes/No] - +- **Team Name**: Paneer_Lovers +- **Year**: 2 +- **All-Female Team**: No ## Architecture Overview #### Describe your approach here. Keep it short and clear. - How does your system ingest and preprocess the varying data sources (market, macro, sentiment)? + The system uses `ingestion.py` to merge four datasets (Equity, Macro, Oil, Multi-Asset) via an inner join on the 'Date' index. `preprocessing.py` standardizes scales using Z-score Normalization and applies Winsorization (±3 sigma) to clip extreme outliers, preventing bias. We engineer features like Rolling Volatility, Momentum, and Cross-Asset Correlation. + - What risk modeling techniques were selected, and how are they integrated into the trading decision pipeline? + We implemented Historical VaR, Parametric VaR, Conditional VaR (Expected Shortfall), and Maximum Drawdown. These are integrated as "Safety Overlays." If 20-day rolling volatility breaches 25%, the system overrides our Machine Learning signals and executes an emergency "Risk-Off" shift into Gold and Cash. + - How does your semi-automated strategy generate signals while respecting portfolio constraints and handling realistic conditions like slippage? + An `MLSignalEngine` generates portfolio targets using a Random Forest model tuned via TimeSeriesSplit to prevent look-ahead bias. 
@st.cache_data
def run_simulations(df):
    """Backtest the ML strategy and a buy-and-hold benchmark over ``df``.

    Both portfolios start with 100,000 of capital and record one NAV
    snapshot per row of ``df``.

    Args:
        df: Preprocessed frame providing the ``Date``, ``Equity_Price``,
            ``MA_Gold_Price`` and ``Oil_Price`` columns, plus whatever
            features ``MLSignalEngine.generate_signal`` reads from a row.

    Returns:
        ``(strategy_portfolio, buy_and_hold_portfolio)``.
    """
    def price_map(record):
        # Asset name -> price taken from the given row.
        return {
            "Equity": record["Equity_Price"],
            "Gold": record["MA_Gold_Price"],
            "Oil": record["Oil_Price"],
        }

    # --- Active strategy ---
    strategy = Portfolio(initial_capital=100_000, max_position_pct=0.95)
    signal_engine = MLSignalEngine()

    # Sentinel far in the past so the scheduled rebalance fires on the first
    # row. NOTE(review): the arithmetic below assumes df has an integer
    # index -- confirm against preprocess().
    rebalance_anchor = -999

    for idx, record in df.iterrows():
        today = record["Date"]
        prices = price_map(record)
        strategy.record_snapshot(today, prices)

        signal = signal_engine.generate_signal(record)
        nav = strategy.compute_nav(prices)

        # Current weight of each asset as a fraction of NAV (0 if NAV <= 0).
        weights_now = {}
        for asset in prices:
            if nav > 0:
                held_value = strategy.positions.get(asset, 0) * prices.get(asset, 0)
                weights_now[asset] = held_value / nav
            else:
                weights_now[asset] = 0

        risk_off = any(tag in signal.reason for tag in ("Risk-Off", "De-risk"))
        largest_gap = max(
            (
                abs(target - weights_now.get(asset, 0.0))
                for asset, target in signal.target_weights.items()
            ),
            default=0,
        )

        # Rebalance on the 21-row schedule, or early when a risk-off signal
        # is more than 5 percentage points away from the current weights.
        scheduled = idx - rebalance_anchor >= 21
        if scheduled or (risk_off and largest_gap > 0.05):
            strategy.rebalance(signal.target_weights, prices, today, signal.reason)
            if scheduled:
                # Only scheduled rebalances move the anchor.
                rebalance_anchor = idx

    # --- Buy-and-hold benchmark ---
    benchmark = Portfolio(initial_capital=100_000)
    opening_row = df.iloc[0]
    opening_price = opening_row["Equity_Price"]
    # 90% of capital, whole units only.
    units = (100_000 * 0.90) // opening_price
    benchmark.buy("Equity", units, opening_price, df.iloc[0]["Date"], "Initial")

    for _, record in df.iterrows():
        benchmark.record_snapshot(record["Date"], price_map(record))

    return strategy, benchmark
benchmark_returns=equity_series) +bh_metrics = compute_all_risk_metrics(bh_ret, benchmark_returns=equity_series) + +# --- Display KPIs --- +col1, col2, col3, col4 = st.columns(4) +col1.metric("Strategy Total Return", f"{strat_metrics.total_return:.2%}", delta_color="normal") +col2.metric("Buy & Hold Return", f"{bh_metrics.total_return:.2%}", delta_color="normal") +col3.metric("Strategy Max Drawdown", f"{strat_metrics.drawdown.max_drawdown:.2%}") +col4.metric("Strategy Sharpe", f"{strat_metrics.sharpe_ratio:.2f}") + +st.divider() + +# --- Interactive Charts --- +st.subheader("Performance Comparison (NAV)") +chart_data = pd.DataFrame({ + 'Date': strat_nav['date'], + 'Active Strategy': strat_nav['nav'], + 'Buy & Hold': bh_nav['nav'] +}).set_index('Date') + +st.line_chart(chart_data) + +st.subheader("Drawdown Comparison") +cum_s = (1 + strat_ret).cumprod() +cum_b = (1 + bh_ret).cumprod() +dd_s = (cum_s - cum_s.cummax()) / cum_s.cummax() +dd_b = (cum_b - cum_b.cummax()) / cum_b.cummax() + +dd_data = pd.DataFrame({ + 'Date': strat_nav['date'], + 'Strategy Drawdown': dd_s.values, + 'B&H Drawdown': dd_b.values +}).set_index('Date') + +st.line_chart(dd_data) + +# --- Risk Table --- +st.subheader("Detailed Risk Metrics") +metrics_df = pd.DataFrame({ + "Metric": ["Annualised Return", "Annualised Volatility", "Sharpe Ratio", "Sortino Ratio", "VaR 95%", "Max Drawdown"], + "Active Strategy": [ + f"{strat_metrics.ann_return:.2%}", f"{strat_metrics.ann_volatility:.2%}", + f"{strat_metrics.sharpe_ratio:.2f}", f"{strat_metrics.sortino_ratio:.2f}", + f"{strat_metrics.var_95.historical:.2%}", f"{strat_metrics.drawdown.max_drawdown:.2%}" + ], + "Buy & Hold": [ + f"{bh_metrics.ann_return:.2%}", f"{bh_metrics.ann_volatility:.2%}", + f"{bh_metrics.sharpe_ratio:.2f}", f"{bh_metrics.sortino_ratio:.2f}", + f"{bh_metrics.var_95.historical:.2%}", f"{bh_metrics.drawdown.max_drawdown:.2%}" + ] +}) +st.table(metrics_df) + +# --- Trade Logs --- +st.subheader("Trade Audit Log (Last 100 Trades)") 
def evaluate():
    """Train the ML model, backtest the ML signal engine and print a report.

    Steps, in order:
      1. Load the master dataset and preprocess it.
      2. Train the model via ``train_model``.
      3. Walk the data row by row: record a NAV snapshot, ask the engine
         for a signal, and rebalance either on the 21-row schedule or when
         any target weight is more than 10 percentage points away from the
         current weight.
      4. Print risk metrics (benchmarked against equity returns) and the
         first ten trades.

    Rows for which signal generation raises are skipped (best effort), but
    the number of skipped rows and the first error are reported rather than
    being silently discarded.
    """
    # This script's module-level imports do not include pandas, yet the
    # benchmark series below needs it; import it here so the function is
    # self-contained instead of failing with NameError after the backtest.
    import pandas as pd

    print("============================================================")
    print(" LOADING & PREPROCESSING")
    print("============================================================")
    raw = load_master()
    df, _ = preprocess(raw)

    print("\n============================================================")
    print(" TRAINING ML MODEL")
    print("============================================================")
    # Train and save the model
    train_model(df)

    print("\n============================================================")
    print(" BACKTESTING ML STRATEGY")
    print("============================================================")

    # Initialize the new ML Signal Engine
    engine = MLSignalEngine()

    port = Portfolio(initial_capital=100_000, max_position_pct=0.95)
    # Sentinel far in the past so the scheduled rebalance fires on the first
    # usable row. NOTE(review): assumes df has an integer index -- confirm.
    last_rebalance_idx = -999

    skipped_rows = 0
    first_error = None

    for i, row in df.iterrows():
        date = row["Date"]
        prices = {
            "Equity": row["Equity_Price"],
            "Gold": row["MA_Gold_Price"],
            "Oil": row["Oil_Price"],
        }

        port.record_snapshot(date, prices)

        # The engine needs every feature to be present. Warm-up rows that
        # still lack rolling features are skipped, but tallied so a
        # systematic failure cannot hide behind the `continue`.
        try:
            signal = engine.generate_signal(row)
        except Exception as exc:  # engine's exception types are not known here
            skipped_rows += 1
            if first_error is None:
                first_error = exc
            continue

        nav = port.compute_nav(prices)
        current_weights = {
            asset: (port.positions.get(asset, 0) * prices.get(asset, 0)) / nav if nav > 0 else 0
            for asset in prices
        }

        # Largest absolute gap between target and current weight.
        max_deviation = max(
            (
                abs(target_w - current_weights.get(asset, 0.0))
                for asset, target_w in signal.target_weights.items()
            ),
            default=0,
        )

        # Rebalance every 21 days OR if deviation > 10%
        # (more lenient than rule-based to avoid overtrading)
        scheduled = i - last_rebalance_idx >= 21
        if scheduled or max_deviation > 0.10:
            port.rebalance(signal.target_weights, prices, date, signal.reason)
            if scheduled:
                # Only scheduled rebalances reset the 21-row clock.
                last_rebalance_idx = i

    if skipped_rows:
        print(
            f"\n WARNING: skipped {skipped_rows} row(s) where signal generation "
            f"failed (first error: {first_error!r})"
        )

    strat_ret = port.get_returns()
    # Trim the benchmark to the same number of observations as the strategy.
    benchmark_values = df["Equity_Returns_clean"].values
    if len(strat_ret) > 0:
        equity_ret = benchmark_values[-len(strat_ret):]
    else:
        equity_ret = benchmark_values

    metrics = compute_all_risk_metrics(
        strat_ret.reset_index(drop=True),
        benchmark_returns=pd.Series(equity_ret),
        risk_free_annual=0.02,
    )

    print("\n [ML Strategy Portfolio]")
    print(metrics.summary())

    print(f"\n Total trades executed: {len(port.trade_log)}")
    print(" First 10 Trades:")
    if not port.trade_log.empty:
        print(port.trade_log.head(10)[["date", "asset", "action", "quantity", "reason"]].to_string())
def evaluate():
    """Run ingestion + preprocessing and print the Phase 1 data report.

    Prints, in order: the raw dataset overview, the post-processing
    overview, equity return statistics, a per-asset summary table, the
    engineered-feature table and a correlation matrix. It then calls
    ``_save_charts`` and prints a completion banner.
    """
    trading_days = 252  # annualisation factor used throughout the report

    # ── STEP 1: LOAD ──────────────────────────────────────────────────────
    section("STEP 1 — Data Ingestion")
    raw = load_master()

    subsection("Raw dataset overview")
    dates = raw["Date"]
    print(f" Rows : {len(raw):,}")
    print(f" Columns : {len(raw.columns)}")
    print(f" Date range : {dates.min().date()} -> {dates.max().date()}")
    print(f" Years covered : {dates.dt.year.nunique()}")
    print(f" NaN totals :")
    for column, missing in raw.isnull().sum().items():
        if not missing:
            continue  # only columns that actually contain NaNs are listed
        print(f" {column:<30} : {missing}")

    # ── STEP 2: PREPROCESS ────────────────────────────────────────────────
    section("STEP 2 — Preprocessing")
    df, _ = preprocess(raw)

    subsection("Post-processing overview")
    print(
        "\n".join(
            [
                f" Rows after clean : {len(df):,}",
                f" Total NaNs : {df.isnull().sum().sum()}",
                f" Columns : {len(df.columns)}",
            ]
        )
    )

    # ── STEP 3: EQUITY RETURN STATS ───────────────────────────────────────
    section("STEP 3 — Equity Return Statistics")
    equity = df["Equity_Returns_clean"]

    ann_return = equity.mean() * trading_days
    ann_vol = equity.std() * np.sqrt(trading_days)
    if ann_vol:
        sharpe = ann_return / ann_vol
    else:
        sharpe = 0

    # Max drawdown of the compounded return curve.
    growth = (1 + equity).cumprod()
    peak = growth.cummax()
    max_dd = ((growth - peak) / peak).min()

    print(
        "\n".join(
            [
                f"\n Annualised Return : {ann_return:>10.4%}",
                f" Annualised Vol : {ann_vol:>10.4%}",
                f" Sharpe Ratio : {sharpe:>10.4f}",
                f" Skewness : {equity.skew():>10.4f}",
                f" Excess Kurtosis : {equity.kurtosis():>10.4f}",
                f" VaR 95% (daily) : {equity.quantile(0.05):>10.4%}",
                f" VaR 99% (daily) : {equity.quantile(0.01):>10.4%}",
                f" Max Drawdown : {max_dd:>10.4%}",
                f" Total +ve days : {(equity > 0).sum():>10,} / {len(equity):,}",
            ]
        )
    )

    # ── STEP 4: MULTI-ASSET STATS ─────────────────────────────────────────
    section("STEP 4 — Multi-Asset Summary")
    return_columns = {
        "Equity": "Equity_Returns_clean",
        "Oil": "Oil_Returns_clean",
        "Gold": "MA_Gold_Returns_clean",
    }

    print(f"\n {'Asset':<10} {'Ann.Ret':>10} {'Ann.Vol':>10} {'Sharpe':>10} {'VaR95%':>10}")
    print(f" {'-'*52}")
    for asset, column in return_columns.items():
        series = df[column]
        asset_ret = series.mean() * trading_days
        asset_vol = series.std() * np.sqrt(trading_days)
        if asset_vol:
            asset_sharpe = asset_ret / asset_vol
        else:
            asset_sharpe = 0
        asset_var = series.quantile(0.05)
        print(f" {asset:<10} {asset_ret:>10.4%} {asset_vol:>10.4%} {asset_sharpe:>10.4f} {asset_var:>10.4%}")

    # ── STEP 5: FEATURE QUALITY ───────────────────────────────────────────
    section("STEP 5 — Engineered Features")
    feature_markers = ["Vol", "Momentum", "RoC", "Corr", "norm"]
    feature_columns = []
    for column in df.columns:
        # A column counts as a feature if its name contains any marker.
        if any(marker in column for marker in feature_markers):
            feature_columns.append(column)

    print(f"\n {'Feature':<35} {'Mean':>10} {'Std':>10} {'NaN':>6}")
    print(f" {'-'*65}")
    for column in feature_columns:
        values = df[column]
        print(f" {column:<35} {values.mean():>10.4f} {values.std():>10.4f} {values.isna().sum():>6}")

    # ── STEP 6: CORRELATION MATRIX ────────────────────────────────────────
    section("STEP 6 — Return Correlations")
    short_labels = {
        "Equity_Returns_clean": "Equity",
        "Oil_Returns_clean": "Oil",
        "MA_Gold_Returns_clean": "Gold",
        "Inflation_norm": "Inflation",
        "Interest_Rate_norm": "IntRate",
        "USD_Index_norm": "USD",
        "Sentiment_norm": "Sentiment",
    }
    # The dict's key order doubles as the column order of the matrix.
    matrix = df[list(short_labels)].corr().round(3)
    matrix = matrix.rename(columns=short_labels, index=short_labels)
    print(f"\n{matrix.to_string()}")

    # ── STEP 7: CHARTS ────────────────────────────────────────────────────
    section("STEP 7 — Generating Charts -> reports/")
    _save_charts(df)

    # ── DONE ──────────────────────────────────────────────────────────────
    section("PHASE 1 COMPLETE")
    print(
        "\n".join(
            [
                " Ingestion : OK",
                " Preprocessing: OK",
                " Features : OK",
                " Charts saved : reports/",
                "\n Next -> Phase 2: Risk Model (VaR, Drawdown, Sharpe)",
            ]
        )
    )
axes[0].plot(df["Date"], df["Equity_SMA10"], color="#ff6b35", lw=1.0, alpha=0.8, label="SMA10") + axes[0].set_title("Equity Price & SMA10", fontsize=13) + axes[0].legend() + axes[0].grid(True) + + axes[1].bar(df["Date"], df["Equity_Volume"], color="#7c3aed", alpha=0.5, width=1) + axes[1].set_title("Daily Volume", fontsize=13) + axes[1].grid(True) + + plt.tight_layout() + plt.savefig(REPORT_DIR / "01_equity_price.png", dpi=120, bbox_inches="tight") + plt.close() + print(" Saved: 01_equity_price.png") + + # ── Chart 2: Cumulative Returns ── + cum_equity = (1 + df["Equity_Returns_clean"]).cumprod() + cum_oil = (1 + df["Oil_Returns_clean"]).cumprod() + cum_gold = (1 + df["MA_Gold_Returns_clean"]).cumprod() + + fig, ax = plt.subplots(figsize=(16, 6)) + ax.plot(df["Date"], cum_equity, color="#00d4ff", lw=1.2, label="Equity") + ax.plot(df["Date"], cum_oil, color="#f59e0b", lw=1.2, label="Oil") + ax.plot(df["Date"], cum_gold, color="#fbbf24", lw=1.2, label="Gold") + ax.axhline(1, color="white", lw=0.5, linestyle="--", alpha=0.4) + ax.set_title("Cumulative Returns — Equity vs Oil vs Gold", fontsize=13) + ax.set_ylabel("Growth of $1") + ax.legend() + ax.grid(True) + plt.tight_layout() + plt.savefig(REPORT_DIR / "02_cumulative_returns.png", dpi=120, bbox_inches="tight") + plt.close() + print(" Saved: 02_cumulative_returns.png") + + # ── Chart 3: Drawdown ── + cum = (1 + df["Equity_Returns_clean"]).cumprod() + roll = cum.cummax() + dd = (cum - roll) / roll + + fig, ax = plt.subplots(figsize=(16, 5)) + ax.fill_between(df["Date"], dd, 0, color="#f87171", alpha=0.7) + ax.set_title("Equity Drawdown", fontsize=13) + ax.set_ylabel("Drawdown %") + ax.grid(True) + plt.tight_layout() + plt.savefig(REPORT_DIR / "03_drawdown.png", dpi=120, bbox_inches="tight") + plt.close() + print(" Saved: 03_drawdown.png") + + # ── Chart 4: Rolling Volatility ── + fig, ax = plt.subplots(figsize=(16, 5)) + ax.plot(df["Date"], df["Equity_RollingVol20"], color="#a78bfa", lw=1, label="Equity Vol (20d)") + 
ax.plot(df["Date"], df["Oil_Volatility"], color="#f59e0b", lw=1, alpha=0.7, label="Oil Vol") + ax.set_title("Rolling Volatility", fontsize=13) + ax.set_ylabel("Annualised Volatility") + ax.legend() + ax.grid(True) + plt.tight_layout() + plt.savefig(REPORT_DIR / "04_rolling_volatility.png", dpi=120, bbox_inches="tight") + plt.close() + print(" Saved: 04_rolling_volatility.png") + + # ── Chart 5: Return Distribution ── + fig, ax = plt.subplots(figsize=(12, 5)) + ret = df["Equity_Returns_clean"] + ax.hist(ret, bins=120, color="#00d4ff", alpha=0.8, edgecolor="none") + ax.axvline(ret.quantile(0.05), color="red", lw=2, linestyle="--", label=f"VaR 95%: {ret.quantile(0.05):.3%}") + ax.axvline(ret.quantile(0.01), color="#f87171", lw=2, linestyle="--", label=f"VaR 99%: {ret.quantile(0.01):.3%}") + ax.axvline(ret.mean(), color="#10b981", lw=2, label=f"Mean: {ret.mean():.4%}") + ax.set_title("Equity Return Distribution", fontsize=13) + ax.set_xlabel("Daily Return") + ax.legend() + ax.grid(True) + plt.tight_layout() + plt.savefig(REPORT_DIR / "05_return_distribution.png", dpi=120, bbox_inches="tight") + plt.close() + print(" Saved: 05_return_distribution.png") + + # ── Chart 6: Macro Signals ── + macro_cols = ["Inflation", "Interest_Rate", "USD_Index", "Sentiment"] + mac_colors = ["#f87171", "#fb923c", "#a78bfa", "#34d399"] + + fig, axes = plt.subplots(4, 1, figsize=(16, 12), sharex=True) + for ax, col, color in zip(axes, macro_cols, mac_colors): + ax.plot(df["Date"], df[col], color=color, lw=0.7, alpha=0.8) + ax.plot(df["Date"], df[col].rolling(30).mean(), + color="white", lw=1.2, linestyle="--", alpha=0.6, label="30d avg") + ax.set_title(col, fontsize=11) + ax.grid(True) + ax.legend(fontsize=8) + plt.suptitle("Macro Signals Over Time", fontsize=14, y=1.01) + plt.tight_layout() + plt.savefig(REPORT_DIR / "06_macro_signals.png", dpi=120, bbox_inches="tight") + plt.close() + print(" Saved: 06_macro_signals.png") + + # ── Chart 7: Correlation heatmap ── + corr_cols = [ + 
"Equity_Returns_clean", "Oil_Returns_clean", + "MA_Gold_Returns_clean", + "Inflation", "Interest_Rate", "USD_Index", "Sentiment", + ] + label_map = { + "Equity_Returns_clean" : "Equity", + "Oil_Returns_clean" : "Oil", + "MA_Gold_Returns_clean" : "Gold", + "Inflation" : "Inflation", + "Interest_Rate" : "IntRate", + "USD_Index" : "USD", + "Sentiment" : "Sentiment", + } + corr = df[corr_cols].corr() + corr.rename(columns=label_map, index=label_map, inplace=True) + + fig, ax = plt.subplots(figsize=(9, 7)) + cmap = plt.cm.RdBu_r + im = ax.imshow(corr, cmap=cmap, vmin=-1, vmax=1) + plt.colorbar(im, ax=ax, fraction=0.04) + ax.set_xticks(range(len(corr.columns))) + ax.set_yticks(range(len(corr.index))) + ax.set_xticklabels(corr.columns, rotation=45, ha="right") + ax.set_yticklabels(corr.index) + for i in range(len(corr)): + for j in range(len(corr.columns)): + ax.text(j, i, f"{corr.iloc[i,j]:.2f}", + ha="center", va="center", fontsize=9, + color="white" if abs(corr.iloc[i,j]) > 0.4 else "black") + ax.set_title("Correlation Matrix", fontsize=13) + plt.tight_layout() + plt.savefig(REPORT_DIR / "07_correlation_heatmap.png", dpi=120, bbox_inches="tight") + plt.close() + print(" Saved: 07_correlation_heatmap.png") + + +# ───────────────────────────────────────────────────────────────────────────── +if __name__ == "__main__": + evaluate() diff --git a/evaluate_phase2.py b/evaluate_phase2.py new file mode 100644 index 00000000..a4228ddb --- /dev/null +++ b/evaluate_phase2.py @@ -0,0 +1,311 @@ +""" +evaluate_phase2.py +------------------ +Phase 2 evaluation: Risk Model + Portfolio State Manager. + +Runs a simple Buy-and-Hold simulation to validate the full +risk / portfolio pipeline before building the signal engine. 
def run_buy_and_hold(df: pd.DataFrame) -> Portfolio:
    """Simulate buying Equity on the first row and holding to the end.

    90% of the initial capital is spent on whole units of Equity at the
    first row's price; after that the portfolio is only marked to market,
    one snapshot per row. The run exists to validate the ``Portfolio``
    class before any signal logic is involved.

    Args:
        df: Frame providing the ``Date``, ``Equity_Price``,
            ``MA_Gold_Price`` and ``Oil_Price`` columns.

    Returns:
        The ``Portfolio`` after every row has been recorded.
    """
    # Asset name -> column in df holding that asset's price.
    price_columns = {
        "Equity": "Equity_Price",
        "Gold": "MA_Gold_Price",
        "Oil": "Oil_Price",
    }

    portfolio = Portfolio(
        initial_capital=100_000,
        transaction_cost=0.001,   # 0.1% commission
        slippage_pct=0.0005,      # 0.05% slippage
        max_position_pct=0.95,    # allow up to 95% in one asset
    )

    opening = df.iloc[0]
    entry_price = opening["Equity_Price"]
    entry_date = opening["Date"]

    # Floor division keeps the order to whole units only.
    units = (portfolio.initial_capital * 0.90) // entry_price
    portfolio.buy(
        "Equity",
        units,
        entry_price,
        entry_date,
        reason="Buy-and-hold initialisation",
    )

    # Mark the portfolio to market on every row, including the first.
    for _, day in df.iterrows():
        daily_prices = {asset: day[column] for asset, column in price_columns.items()}
        portfolio.record_snapshot(date=day["Date"], prices=daily_prices)

    return portfolio
port_ret, + benchmark_returns = equity_ret, # equity index as benchmark + dates = nav_df["date"], + risk_free_annual = 0.02, + ) + print("\n [Buy-and-Hold Portfolio]") + print(port_metrics.summary()) + + # Raw equity risk (as baseline) + eq_metrics = compute_all_risk_metrics( + returns = equity_ret, + dates = df["Date"], + risk_free_annual = 0.02, + ) + print("\n [Raw Equity Benchmark]") + print(eq_metrics.summary()) + + # ── Multi-asset Risk Table ──────────────────────────────────────────────── + section("STEP 3 — Multi-Asset Risk Comparison") + + assets = { + "Portfolio" : port_ret, + "Equity" : equity_ret, + "Oil" : oil_ret, + "Gold" : gold_ret, + } + + print(f"\n {'Asset':<12} {'Ann.Ret':>9} {'Ann.Vol':>9} {'Sharpe':>8} " + f"{'Sortino':>9} {'Calmar':>8} {'VaR95%':>8} {'MaxDD':>9}") + print(f" {'-' * 72}") + + for name, ret in assets.items(): + m = compute_all_risk_metrics(ret, risk_free_annual=0.02) + print( + f" {name:<12} {m.ann_return:>9.3%} {m.ann_volatility:>9.3%} " + f"{m.sharpe_ratio:>8.4f} {m.sortino_ratio:>9.4f} " + f"{m.calmar_ratio:>8.4f} {m.var_95.historical:>8.3%} " + f"{m.drawdown.max_drawdown:>9.3%}" + ) + + # ── Trade Log Audit ─────────────────────────────────────────────────────── + section("STEP 4 — Trade Log (Audit Trail)") + tlog = port.trade_log + print(f"\n Total trades executed: {len(tlog)}") + print(tlog.to_string(index=False)) + + # ── Charts ──────────────────────────────────────────────────────────────── + section("STEP 5 — Generating Phase 2 Charts") + _save_charts(df, nav_df, port_ret, equity_ret) + + # ── Done ────────────────────────────────────────────────────────────────── + section("PHASE 2 COMPLETE") + print(" Portfolio Manager : OK") + print(" Risk Metrics : OK") + print(" VaR / CVaR : OK") + print(" Drawdown : OK") + print(" Alpha / Beta : OK") + print(" Trade Audit Log : OK") + print(" Charts saved : reports/") + print("\n Next -> Phase 3: Signal Engine (buy/sell/hold)") + + +# 
def _save_charts(df, nav_df, port_ret, equity_ret):
    """Render the four Phase 2 charts (08-11) as PNG files under REPORT_DIR.

    Parameters
    ----------
    df         : preprocessed master frame; only its "Date" column is read
    nav_df     : portfolio NAV history with "date", "nav" and "returns" columns
    port_ret   : portfolio return series
    equity_ret : equity return series

    Updates matplotlib's global rcParams to a dark theme before drawing.
    """

    def _drawdown(returns):
        # Fractional distance of cumulative wealth below its running peak.
        wealth = (1 + returns).cumprod()
        peak = wealth.cummax()
        return (wealth - peak) / peak

    def _write(filename):
        # Shared tail of every chart: layout, save, close, report.
        plt.tight_layout()
        plt.savefig(REPORT_DIR / filename, dpi=120, bbox_inches="tight")
        plt.close()
        print(f" Saved: {filename}")

    dark_theme = {
        "figure.facecolor": "#0f1117",
        "axes.facecolor": "#1a1d2e",
        "axes.edgecolor": "#3a3d4e",
        "text.color": "white",
        "axes.labelcolor": "white",
        "xtick.color": "white",
        "ytick.color": "white",
        "grid.color": "#2a2d3e",
        "grid.linestyle": "--",
        "grid.alpha": 0.5,
    }
    plt.rcParams.update(dark_theme)

    dates = df["Date"]
    nav_dates = nav_df["date"]

    # ── Chart 8: NAV over time ──
    _, (nav_ax, ret_ax) = plt.subplots(2, 1, figsize=(16, 9), sharex=True)

    equity_value = (1 + equity_ret).cumprod() * 100_000
    nav_ax.plot(nav_dates, nav_df["nav"], color="#00d4ff", lw=1.5,
                label="Portfolio NAV")
    nav_ax.plot(dates, equity_value.values, color="#ff6b35", lw=1.0, alpha=0.7,
                linestyle="--", label="Equity Buy&Hold ($100K)")
    nav_ax.axhline(100_000, color="white", lw=0.5, linestyle=":", alpha=0.4,
                   label="Initial Capital")
    nav_ax.set_title("Portfolio NAV vs Equity Benchmark", fontsize=13)
    nav_ax.set_ylabel("Portfolio Value ($)")
    nav_ax.legend(fontsize=9)
    nav_ax.grid(True)

    # Daily returns, green for gains and red otherwise.
    daily = nav_df["returns"]
    bar_colors = daily.map(lambda r: "#10b981" if r >= 0 else "#f87171")
    ret_ax.bar(nav_dates, daily, color=bar_colors, alpha=0.7, width=1)
    ret_ax.axhline(0, color="white", lw=0.5)
    ret_ax.set_title("Daily Portfolio Returns", fontsize=13)
    ret_ax.set_ylabel("Return")
    ret_ax.grid(True)

    _write("08_portfolio_nav.png")

    # ── Chart 9: Drawdown comparison ──
    _, dd_ax = plt.subplots(figsize=(16, 6))
    portfolio_dd = _drawdown(port_ret)
    equity_dd = _drawdown(equity_ret)

    dd_ax.fill_between(nav_dates, portfolio_dd.values, color="#7c3aed",
                       alpha=0.6, label="Portfolio DD")
    dd_ax.fill_between(dates, equity_dd.values, color="#f87171", alpha=0.4,
                       label="Equity DD")
    dd_ax.axhline(0, color="white", lw=0.5)
    dd_ax.set_title("Drawdown: Portfolio vs Equity", fontsize=13)
    dd_ax.set_ylabel("Drawdown %")
    dd_ax.legend()
    dd_ax.grid(True)

    _write("09_drawdown_comparison.png")

    # ── Chart 10: Rolling risk metrics ──
    _, (vol_ax, sharpe_ax, var_ax) = plt.subplots(3, 1, figsize=(16, 12),
                                                  sharex=True)

    vol_20d = rolling_volatility(equity_ret, window=20)
    sharpe_252d = rolling_sharpe(equity_ret, window=252)
    var95_252d = rolling_var(equity_ret, window=252, confidence=0.95)

    vol_ax.plot(dates, vol_20d, color="#a78bfa", lw=1)
    vol_ax.set_title("Rolling 20-day Annualised Volatility", fontsize=11)
    vol_ax.set_ylabel("Volatility")
    vol_ax.grid(True)

    sharpe_ax.plot(dates, sharpe_252d, color="#34d399", lw=1)
    sharpe_ax.axhline(0, color="white", lw=0.5, linestyle="--")
    sharpe_ax.axhline(1, color="#10b981", lw=0.5, linestyle=":", alpha=0.6,
                      label="Sharpe=1 target")
    sharpe_ax.set_title("Rolling 252-day Sharpe Ratio", fontsize=11)
    sharpe_ax.set_ylabel("Sharpe")
    sharpe_ax.legend(fontsize=8)
    sharpe_ax.grid(True)

    var_ax.fill_between(dates, var95_252d, 0, color="#f87171", alpha=0.7)
    var_ax.set_title("Rolling 252-day VaR (95%)", fontsize=11)
    var_ax.set_ylabel("Daily VaR")
    var_ax.grid(True)

    plt.suptitle("Rolling Risk Metrics — Equity", fontsize=14)
    _write("10_rolling_risk.png")

    # ── Chart 11: Return distribution with VaR lines ──
    _, hist_axes = plt.subplots(1, 2, figsize=(16, 6))
    panels = (("Portfolio", port_ret), ("Equity", equity_ret))

    for panel, (label, series) in zip(hist_axes, panels):
        panel.hist(series.dropna(), bins=100, color="#00d4ff", alpha=0.7,
                   edgecolor="none", density=True)
        # Historical VaR levels are the lower-tail quantiles of the series.
        q05 = series.quantile(0.05)
        q01 = series.quantile(0.01)
        avg = series.mean()
        panel.axvline(q05, color="orange", lw=2, label=f"VaR 95%: {q05:.3%}")
        panel.axvline(q01, color="red", lw=2, label=f"VaR 99%: {q01:.3%}")
        panel.axvline(avg, color="#10b981", lw=1.5, linestyle="--",
                      label=f"Mean: {avg:.4%}")
        panel.set_title(f"{label} Return Distribution", fontsize=12)
        panel.set_xlabel("Daily Return")
        panel.legend(fontsize=8)
        panel.grid(True)

    _write("11_return_distributions.png")
def run_strategy(
    df: pd.DataFrame,
    *,
    rebalance_every: int = 21,
    drift_threshold: float = 0.05,
) -> Portfolio:
    """Simulate the risk-aware strategy over ``df`` one row at a time.

    For every row the portfolio NAV is snapshotted first, then the signal
    engine is asked for target weights. The portfolio is rebalanced when
    either

    * ``rebalance_every`` rows have passed since the last scheduled
      rebalance (the first row always qualifies), or
    * the signal's reason mentions "Risk-Off" or "De-risk" AND some asset is
      more than ``drift_threshold`` away from its target weight.

    Emergency (risk-off) rebalances do not reset the scheduled clock, so the
    regular cadence is unaffected by them.

    Parameters
    ----------
    df : frame with "Date", "Equity_Price", "MA_Gold_Price" and "Oil_Price"
        columns; each row is also passed whole to the signal engine.
    rebalance_every : rows between scheduled rebalances (default 21).
    drift_threshold : minimum absolute weight deviation that allows a
        risk-off rebalance (default 0.05).

    Returns
    -------
    Portfolio
        The portfolio after the last row, including NAV history and trade log.
    """
    port = Portfolio(
        initial_capital=100_000,
        transaction_cost=0.001,
        slippage_pct=0.0005,
        max_position_pct=0.95,
    )
    engine = RiskAwareSignalEngine()

    # Position of the last *scheduled* rebalance; None until the first one.
    last_scheduled_day = None

    # Count rows with enumerate: iterrows() yields index *labels*, which are
    # not guaranteed to be contiguous integers after preprocessing.
    for day, (_, row) in enumerate(df.iterrows()):
        date = row["Date"]
        prices = {
            "Equity": row["Equity_Price"],
            "Gold": row["MA_Gold_Price"],
            "Oil": row["Oil_Price"],
        }

        # Always record the snapshot so daily NAV is tracked; it returns the
        # NAV at these prices, so no second valuation pass is needed.
        nav = port.record_snapshot(date=date, prices=prices)

        signal = engine.generate_signal(row)

        # Current allocation as a fraction of NAV (all zero if NAV is not positive).
        held = port.positions
        current_weights = {
            asset: (held.get(asset, 0) * price) / nav if nav > 0 else 0
            for asset, price in prices.items()
        }

        max_deviation = max(
            (
                abs(target_w - current_weights.get(asset, 0.0))
                for asset, target_w in signal.target_weights.items()
            ),
            default=0,
        )

        is_risk_off = "Risk-Off" in signal.reason or "De-risk" in signal.reason

        scheduled = (
            last_scheduled_day is None
            or day - last_scheduled_day >= rebalance_every
        )
        # Requiring drift stops us trading every day of a long risk-off regime.
        emergency = is_risk_off and max_deviation > drift_threshold

        if scheduled or emergency:
            port.rebalance(
                target_weights=signal.target_weights,
                prices=prices,
                date=date,
                reason=signal.reason,
            )
            # Only a scheduled rebalance restarts the clock; an emergency
            # trade leaves the regular cadence untouched to avoid whipsawing.
            if scheduled:
                last_scheduled_day = day

    return port
"axes.labelcolor": "white", "xtick.color": "white", "ytick.color": "white", "grid.color": "#2a2d3e"}) + + fig, ax = plt.subplots(figsize=(16, 6)) + ax.plot(strat_nav["date"], strat_nav["nav"], color="#34d399", lw=1.5, label="Active Strategy") + ax.plot(bh_nav["date"], bh_nav["nav"], color="#ff6b35", lw=1.0, alpha=0.7, label="Buy & Hold ($100K)") + ax.axhline(100_000, color="white", lw=0.5, linestyle=":") + ax.set_title("Strategy vs Buy & Hold NAV", fontsize=13) + ax.set_ylabel("Portfolio Value ($)") + ax.legend() + ax.grid(True, linestyle="--", alpha=0.5) + plt.tight_layout() + plt.savefig(REPORT_DIR / "12_strategy_vs_bh.png", dpi=120) + plt.close() + + fig, ax = plt.subplots(figsize=(16, 5)) + cum_s = (1 + strat_ret).cumprod() + cum_b = (1 + bh_ret).cumprod() + ax.fill_between(strat_nav["date"], (cum_s - cum_s.cummax())/cum_s.cummax(), color="#34d399", alpha=0.5, label="Active Strategy DD") + ax.fill_between(bh_nav["date"], (cum_b - cum_b.cummax())/cum_b.cummax(), color="#f87171", alpha=0.4, label="Buy & Hold DD") + ax.set_title("Drawdown Comparison", fontsize=13) + ax.legend() + ax.grid(True, linestyle="--", alpha=0.5) + plt.tight_layout() + plt.savefig(REPORT_DIR / "13_strategy_drawdown.png", dpi=120) + plt.close() + print(" Saved: 12_strategy_vs_bh.png and 13_strategy_drawdown.png") + +if __name__ == "__main__": + evaluate() diff --git a/models/rf_model.joblib b/models/rf_model.joblib new file mode 100644 index 00000000..b3c5b62d Binary files /dev/null and b/models/rf_model.joblib differ diff --git a/notebooks/01_eda.ipynb b/notebooks/01_eda.ipynb new file mode 100644 index 00000000..41817f7d --- /dev/null +++ b/notebooks/01_eda.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d2b00f42", + "metadata": {}, + "source": [ + "# 01 — EDA: Exploratory Data Analysis\n", + "**Goal:** Understand all 4 datasets, spot patterns, and validate the data before building any models.\n", + "\n", + "Datasets:\n", + "- `equity_dataset.csv` — Equity 
price, returns, SMA \n", + "- `macro_dataset.csv` — Inflation, Interest rates, USD, Sentiment \n", + "- `multi_asset_dataset.csv` — Oil, Gold, Bonds \n", + "- `oil_dataset.csv` — Oil price + volatility" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c80e1fe", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../src') # so we can import from src/\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import seaborn as sns\n", + "\n", + "from ingestion import load_master, validate\n", + "\n", + "plt.style.use('dark_background')\n", + "sns.set_palette('husl')\n", + "\n", + "# Load everything in one shot\n", + "df = load_master()\n", + "validate(df)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "871cacc0", + "metadata": {}, + "source": [ + "## 1. Dataset Overview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dadd72a5", + "metadata": { + "vscode": { + "languageId": "markdown" + } + }, + "outputs": [], + "source": [ + "print(f\"Total rows : {len(df):,}\")\n", + "print(f\"Date range : {df['Date'].min().date()} → {df['Date'].max().date()}\")\n", + "print(f\"Trading days/yr : {len(df) / df['Date'].dt.year.nunique():.0f}\")\n", + "print(f\"\\nAll columns:\")\n", + "for col in df.columns:\n", + " print(f\" {col:<25} | NaN: {df[col].isna().sum():>5} | dtype: {df[col].dtype}\")" + ] + }, + { + "cell_type": "markdown", + "id": "192b5ad8", + "metadata": {}, + "source": [ + "## 2. 
Equity Price Over Time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3a224b7", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 1, figsize=(16, 8), sharex=True)\n", + "\n", + "# Price\n", + "axes[0].plot(df['Date'], df['Equity_Price'], color='#00d4ff', lw=1, label='Price')\n", + "axes[0].plot(df['Date'], df['Equity_SMA10'], color='#ff6b35', lw=1, alpha=0.8, label='SMA10')\n", + "axes[0].set_title('Equity Price & 10-Day SMA', fontsize=14)\n", + "axes[0].set_ylabel('Price ($)')\n", + "axes[0].legend()\n", + "\n", + "# Volume\n", + "axes[1].bar(df['Date'], df['Equity_Volume'], color='#7c3aed', alpha=0.6, width=1)\n", + "axes[1].set_title('Daily Volume', fontsize=14)\n", + "axes[1].set_ylabel('Volume')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "9df9c5eb", + "metadata": {}, + "source": [ + "## 3. Returns Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56b60c0b", + "metadata": {}, + "outputs": [], + "source": [ + "returns = df['Equity_Returns'].dropna()\n", + "\n", + "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n", + "\n", + "# Histogram\n", + "axes[0].hist(returns, bins=100, color='#00d4ff', alpha=0.8, edgecolor='none')\n", + "axes[0].axvline(returns.mean(), color='#ff6b35', lw=2, label=f'Mean: {returns.mean():.4f}')\n", + "axes[0].axvline(returns.quantile(0.05), color='red', lw=1.5, linestyle='--', label=f'5% VaR: {returns.quantile(0.05):.4f}')\n", + "axes[0].set_title('Return Distribution')\n", + "axes[0].legend(fontsize=9)\n", + "\n", + "# Rolling returns over time\n", + "axes[1].plot(df['Date'], returns, color='#00d4ff', alpha=0.5, lw=0.5)\n", + "axes[1].axhline(0, color='white', lw=0.5, linestyle='--')\n", + "axes[1].set_title('Daily Returns Over Time')\n", + "\n", + "# Cumulative return\n", + "cumret = (1 + returns).cumprod()\n", + "axes[2].plot(df['Date'].iloc[1:], cumret.values, color='#10b981', 
lw=1.5)\n", + "axes[2].set_title('Cumulative Return')\n", + "axes[2].set_ylabel('Portfolio Value (starting at 1)')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(f\"Annualized Return : {returns.mean() * 365:.2%}\")\n", + "print(f\"Annualized Std : {returns.std() * np.sqrt(365):.2%}\")\n", + "print(f\"Sharpe (0% RFR) : {(returns.mean() / returns.std()) * np.sqrt(365):.3f}\")\n", + "print(f\"Max Daily Return : {returns.max():.4f}\")\n", + "print(f\"Min Daily Return : {returns.min():.4f}\")\n", + "print(f\"Skewness : {returns.skew():.4f}\")\n", + "print(f\"Kurtosis : {returns.kurtosis():.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e4325835", + "metadata": {}, + "source": [ + "## 4. Multi-Asset Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aaae1d61", + "metadata": {}, + "outputs": [], + "source": [ + "# Normalize prices to 100 at start for fair comparison\n", + "assets = {\n", + " 'Equity' : 'Equity_Price',\n", + " 'Oil' : 'Oil_Price',\n", + " 'Gold' : 'MA_Gold_Price',\n", + " 'Bonds' : 'MA_Bonds_Price'\n", + "}\n", + "\n", + "colors = ['#00d4ff', '#f59e0b', '#fbbf24', '#10b981']\n", + "\n", + "fig, ax = plt.subplots(figsize=(16, 6))\n", + "for (name, col), color in zip(assets.items(), colors):\n", + " series = df[col].dropna()\n", + " normalized = (series / series.iloc[0]) * 100\n", + " ax.plot(df['Date'].iloc[:len(series)], normalized, label=name, lw=1.2, color=color)\n", + "\n", + "ax.axhline(100, color='white', lw=0.5, linestyle='--', alpha=0.5)\n", + "ax.set_title('Asset Prices Normalized to 100 (Base = Jan 2020)', fontsize=14)\n", + "ax.set_ylabel('Normalized Price')\n", + "ax.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ed91b070", + "metadata": {}, + "source": [ + "## 5. 
Correlation Matrix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7701719c", + "metadata": {}, + "outputs": [], + "source": [ + "return_cols = [\n", + " 'Equity_Returns', 'Oil_Returns', 'MA_Oil_Returns',\n", + " 'MA_Gold_Returns', 'Inflation', 'Interest_Rate',\n", + " 'USD_Index', 'Sentiment'\n", + "]\n", + "\n", + "corr = df[return_cols].dropna().corr()\n", + "\n", + "plt.figure(figsize=(10, 8))\n", + "mask = np.triu(np.ones_like(corr, dtype=bool))\n", + "sns.heatmap(\n", + " corr, mask=mask, annot=True, fmt='.2f',\n", + " cmap='RdBu_r', center=0, vmin=-1, vmax=1,\n", + " linewidths=0.5, square=True\n", + ")\n", + "plt.title('Correlation Matrix — Returns & Macro Signals', fontsize=13)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "0abcf9f6", + "metadata": {}, + "source": [ + "## 6. Macro Signals Over Time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec5622a4", + "metadata": {}, + "outputs": [], + "source": [ + "macro_cols = ['Inflation', 'Interest_Rate', 'USD_Index', 'Sentiment']\n", + "colors_macro = ['#f87171', '#fb923c', '#a78bfa', '#34d399']\n", + "\n", + "fig, axes = plt.subplots(4, 1, figsize=(16, 12), sharex=True)\n", + "for ax, col, color in zip(axes, macro_cols, colors_macro):\n", + " ax.plot(df['Date'], df[col], color=color, lw=0.8)\n", + " # Rolling 30-day average\n", + " ax.plot(df['Date'], df[col].rolling(30).mean(), color='white', lw=1.5, alpha=0.7, linestyle='--')\n", + " ax.set_ylabel(col, fontsize=10)\n", + " ax.set_title(col, fontsize=11)\n", + "\n", + "plt.suptitle('Macro Signals Over Time (dashed = 30-day avg)', fontsize=14)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "34fc1e43", + "metadata": {}, + "source": [ + "## 7. 
Oil Volatility" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64a22158", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 1, figsize=(16, 7), sharex=True)\n", + "\n", + "axes[0].plot(df['Date'], df['Oil_Price'], color='#f59e0b', lw=1)\n", + "axes[0].set_title('Oil Price')\n", + "\n", + "axes[1].fill_between(df['Date'], df['Oil_Volatility'], color='#f87171', alpha=0.7)\n", + "axes[1].set_title('Oil Volatility (Rolling Std)')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(f\"Mean Oil Volatility : {df['Oil_Volatility'].mean():.5f}\")\n", + "print(f\"Peak Oil Volatility : {df['Oil_Volatility'].max():.5f} on {df.loc[df['Oil_Volatility'].idxmax(), 'Date'].date()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "ed8558c8", + "metadata": {}, + "source": [ + "## 8. Next Steps\n", + "\n", + "From this EDA, you should now know:\n", + "- ✅ What the overall price trend looks like\n", + "- ✅ Which assets are correlated\n", + "- ✅ How volatile the data is\n", + "- ✅ What macro regimes exist\n", + "\n", + "**→ Go to `02_preprocessing.ipynb` to clean the data and engineer features.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c589ced-c53a-4f41-8b9f-d6eb5cb970f9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f574d34b-1ca4-4d9c-9e80-ae3a21fe9a37", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cafe6ea5-e559-4e5f-8569-7b0010f4dbc4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51a8118c-f53d-4c01-b50b-012e6155b12b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..50075e75 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +pandas>=2.0.0 +numpy>=1.24.0 +matplotlib>=3.7.0 +seaborn>=0.12.0 +scikit-learn>=1.3.0 +jupyter>=1.0.0 +notebook>=7.0.0 +ipykernel>=6.0.0 +streamlit>=1.30.0 +plotly>=5.18.0 diff --git a/src/ingestion.py b/src/ingestion.py new file mode 100644 index 00000000..e1c7de0c --- /dev/null +++ b/src/ingestion.py @@ -0,0 +1,125 @@ +""" +ingestion.py +------------ +Loads, validates, and merges all 4 raw datasets into a single master DataFrame. + +Datasets: + - equity_dataset.csv : Daily equity price, volume, returns, SMA_10 + - macro_dataset.csv : Inflation, Interest_Rate, USD_Index, Sentiment + - multi_asset_dataset.csv: Oil, Gold, Bonds prices + returns + - oil_dataset.csv : Oil price, volume, returns, volatility +""" + +import pandas as pd +from pathlib import Path + +# ── Paths ──────────────────────────────────────────────────────────────────── +DATA_DIR = Path(__file__).resolve().parent.parent / "data" / "raw" + +EQUITY_PATH = DATA_DIR / "equity_dataset.csv" +MACRO_PATH = DATA_DIR / "macro_dataset.csv" +MULTI_ASSET_PATH = DATA_DIR / "multi_asset_dataset.csv" +OIL_PATH = DATA_DIR / "oil_dataset.csv" + + +# ── Loaders ────────────────────────────────────────────────────────────────── + +def load_equity() -> pd.DataFrame: + """Load equity dataset. 
def load_macro() -> pd.DataFrame:
    """Read the macro CSV at MACRO_PATH, ordered by Date with a fresh 0..n-1 index.

    Columns are passed through unchanged; the module header lists them as
    Inflation, Interest_Rate, USD_Index and Sentiment.
    """
    return (
        pd.read_csv(MACRO_PATH, parse_dates=["Date"])
        .sort_values("Date")
        .reset_index(drop=True)
    )


def load_multi_asset() -> pd.DataFrame:
    """Read the multi-asset CSV at MULTI_ASSET_PATH and prefix its columns with ``MA_``.

    Oil, Gold and Bonds become MA_Oil_Price, MA_Gold_Price and MA_Bonds_Price;
    Oil_Returns and Gold_Returns become MA_Oil_Returns and MA_Gold_Returns.
    The result is ordered by Date with a fresh 0..n-1 index.
    """
    column_map = {
        "Oil": "MA_Oil_Price",
        "Gold": "MA_Gold_Price",
        "Bonds": "MA_Bonds_Price",
        "Oil_Returns": "MA_Oil_Returns",
        "Gold_Returns": "MA_Gold_Returns",
    }
    frame = pd.read_csv(MULTI_ASSET_PATH, parse_dates=["Date"])
    renamed = frame.rename(columns=column_map)
    return renamed.sort_values("Date").reset_index(drop=True)


def load_oil() -> pd.DataFrame:
    """Read the oil CSV at OIL_PATH and prefix its columns with ``Oil_``.

    Price, Volume, Returns and Volatility become Oil_Price, Oil_Volume,
    Oil_Returns and Oil_Volatility. The result is ordered by Date with a
    fresh 0..n-1 index.
    """
    column_map = {
        "Price": "Oil_Price",
        "Volume": "Oil_Volume",
        "Returns": "Oil_Returns",
        "Volatility": "Oil_Volatility",
    }
    frame = pd.read_csv(OIL_PATH, parse_dates=["Date"])
    renamed = frame.rename(columns=column_map)
    return renamed.sort_values("Date").reset_index(drop=True)
def validate(df: pd.DataFrame) -> None:
    """Print a three-part health report for ``df``.

    The report covers per-column NaN counts (only columns that have any),
    the number of repeated values in the "Date" column, and
    ``describe()`` statistics rounded to four decimals. Nothing is returned
    and ``df`` is not modified.
    """
    print("\n-- NaN counts --")
    missing = df.isnull().sum()
    if missing.any():
        print(missing[missing > 0].to_string())
    else:
        print("No NaNs found OK")

    print("\n-- Duplicate dates --")
    n_repeated = df["Date"].duplicated().sum()
    if n_repeated:
        print(f"{n_repeated} duplicate date rows")
    else:
        print("No duplicates OK")

    print("\n-- Basic stats --")
    summary = df.describe().round(4)
    print(summary.to_string())
def prepare_ml_data(
    df: pd.DataFrame,
    forward_window: int = 10,
    features: "list[str] | None" = None,
) -> pd.DataFrame:
    """Build the supervised-learning frame: feature columns plus a binary ``Target``.

    ``Target`` is 1 when "Equity_Price" ``forward_window`` rows ahead is
    higher than the current price, else 0. Rows whose forward price is not
    known (the last ``forward_window`` rows, or rows with a missing price at
    either end of the window) are dropped rather than labelled 0, and so are
    rows with a missing feature.

    Parameters
    ----------
    df : source frame; must contain "Equity_Price" and the feature columns.
        It is NOT modified -- the label is added to a copy.
    forward_window : look-ahead horizon in rows (default 10).
    features : feature column names to require; defaults to the module-level
        ``FEATURES`` list.

    Returns
    -------
    pd.DataFrame
        Copy of the usable rows of ``df`` with an integer ``Target`` column.
    """
    feature_cols = list(FEATURES if features is None else features)

    future_returns = df["Equity_Price"].shift(-forward_window) / df["Equity_Price"] - 1

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    labelled = df.assign(Target=(future_returns > 0.0).astype(int))

    # A NaN forward return means the outcome is unknown; comparing it to 0
    # would silently label it "down", so filter those rows out explicitly
    # instead of trimming a fixed number of rows from the tail.
    labelled = labelled[future_returns.notna().to_numpy()]

    return labelled.dropna(subset=feature_cols + ["Target"]).copy()
def _feature_frame(row: pd.Series) -> pd.DataFrame:
    """Build the one-row model input from ``row``, with FEATURES as column names.

    ``train_model`` fits on a DataFrame, so the fitted estimator remembers the
    feature names; passing a named frame here keeps prediction input
    consistent with training and avoids scikit-learn's feature-name mismatch
    warning. Features absent from ``row`` default to 0.0.
    """
    values = [row.get(name, 0.0) for name in FEATURES]
    return pd.DataFrame([values], columns=FEATURES)


def predict_signal(row: pd.Series, model: RandomForestClassifier) -> int:
    """Return the model's class prediction for one row: 1 (UP) or 0 (DOWN)."""
    return int(model.predict(_feature_frame(row))[0])


def get_prediction_probability(row: pd.Series, model: RandomForestClassifier) -> float:
    """Return the predicted probability of the UP (1) class for one row.

    Reads the second column of ``predict_proba``.
    NOTE(review): that is class 1 only if the model saw both classes during
    fitting -- confirm against the training data.
    """
    return float(model.predict_proba(_feature_frame(row))[0][1])
@dataclass
class TradeRecord:
    """Record of a single executed trade, one entry of the portfolio's trade log.

    Intended to be immutable: instances are meant to be created once and never
    edited. NOTE(review): the dataclass is not declared ``frozen``, so this is
    a convention and is not enforced at runtime.
    """
    date        : object   # trade date (any date-like value; not validated here)
    asset       : str      # asset identifier
    action      : str      # "BUY" | "SELL"
    quantity    : float    # number of shares / units
    price       : float    # execution price (post-slippage)
    raw_price   : float    # price before slippage
    slippage    : float    # slippage cost (absolute $)
    commission  : float    # transaction cost (absolute $)
    cash_before : float    # presumably the cash balance just before the trade
    cash_after  : float    # presumably the cash balance just after the trade
    reason      : str = "" # signal / rule that triggered the trade


@dataclass
class PortfolioSnapshot:
    """Point-in-time snapshot of portfolio state, one entry of the NAV history."""
    date      : object   # snapshot date
    cash      : float    # cash balance at the snapshot
    positions : dict     # {asset: shares}
    prices    : dict     # {asset: current price}
    nav       : float    # total portfolio value
    returns   : float    # day-over-day return
    def __init__(
        self,
        initial_capital  : float = 100_000.0,
        transaction_cost : float = 0.001,    # 0.1%
        slippage_pct     : float = 0.0005,   # 0.05%
        max_position_pct : float = 0.20,     # 20% per asset
    ) -> None:
        """Create a portfolio that starts fully in cash with no positions.

        Parameters
        ----------
        initial_capital  : starting cash
        transaction_cost : fraction of trade value charged as commission
        slippage_pct     : fraction of price added/subtracted on execution
        max_position_pct : maximum fraction of NAV allowed in a single asset
        """
        self.initial_capital  = initial_capital
        self.cash             = initial_capital
        self.transaction_cost = transaction_cost
        self.slippage_pct     = slippage_pct
        self.max_position_pct = max_position_pct

        # {asset_name: shares_held}
        self._positions : dict[str, float] = {}
        # {asset_name: average_cost_basis}
        self._cost_basis: dict[str, float] = {}

        # Audit trail
        self._trade_log   : list[TradeRecord] = []
        self._nav_history : list[PortfolioSnapshot] = []

        # Baseline for the first day-over-day return in record_snapshot().
        self._prev_nav = initial_capital

    # ── Properties ────────────────────────────────────────────────────────────

    @property
    def positions(self) -> dict[str, float]:
        """Return a copy of {asset: shares}; editing it does not change the portfolio."""
        return dict(self._positions)

    @property
    def trade_log(self) -> pd.DataFrame:
        """Return full trade history as a DataFrame.

        One row per TradeRecord, one column per record field. When no trade
        has been recorded the result is an empty DataFrame with no columns.
        """
        if not self._trade_log:
            return pd.DataFrame()
        return pd.DataFrame([t.__dict__ for t in self._trade_log])

    @property
    def nav_history(self) -> pd.DataFrame:
        """Return NAV history as a DataFrame.

        One row per recorded snapshot with "date", "cash", "nav" and
        "returns" columns, plus one "pos_<asset>" column per asset held in
        that snapshot. When no snapshot has been recorded the result is an
        empty DataFrame with no columns.
        """
        if not self._nav_history:
            return pd.DataFrame()
        records = []
        for s in self._nav_history:
            records.append({
                "date"    : s.date,
                "cash"    : s.cash,
                "nav"     : s.nav,
                "returns" : s.returns,
                # Flatten holdings into per-asset columns.
                **{f"pos_{k}": v for k, v in s.positions.items()},
            })
        return pd.DataFrame(records)
def compute_nav(self, prices: dict[str, float]) -> float:
    """
    Return the portfolio's net asset value for one price snapshot.

    NAV = cash + sum(shares_held[asset] * price[asset])

    The sum runs over the positions that are actually held, so quotes for
    assets outside the portfolio are ignored.  (Iterating over `prices`
    instead would compute `0 * price` for every unheld asset, and a single
    NaN/inf quote for an asset we do not own would turn the whole NAV
    into NaN.)

    Parameters
    ----------
    prices : {asset_name: current_price} — snapshot for today

    Returns
    -------
    Cash plus the marked value of every held asset that has a quote in
    `prices`.  A held asset with no quote contributes 0.
    """
    position_value = sum(
        shares * prices[asset]
        for asset, shares in self._positions.items()
        if asset in prices
    )
    return self.cash + position_value
+ """ + direction = 1 if action == "BUY" else -1 + return price * (1 + direction * self.slippage_pct) + + def _apply_commission(self, trade_value: float) -> float: + """Commission = trade_value * transaction_cost_rate""" + return abs(trade_value) * self.transaction_cost + + def _check_capital_shortfall(self, required: float, label: str): + """Raise an informative error if insufficient cash (Issue 15).""" + if self.cash < required: + raise ValueError( + f"[portfolio] Capital shortfall on {label}: " + f"need ${required:,.2f}, have ${self.cash:,.2f}" + ) + + def _check_position_limit(self, asset: str, cost: float, prices: dict[str, float]): + """Raise if new position would exceed max_position_pct of NAV (Issue 9).""" + if not prices: + return + nav = self.compute_nav(prices) + existing_val = self._positions.get(asset, 0) * prices.get(asset, 0) + if (existing_val + cost) / nav > self.max_position_pct: + raise ValueError( + f"[portfolio] Position limit breach for {asset}: " + f"would exceed {self.max_position_pct:.0%} of NAV=${nav:,.2f}" + ) + + def buy( + self, + asset : str, + quantity : float, + price : float, + date : object, + reason : str = "", + prices : Optional[dict] = None, + ) -> TradeRecord: + """ + Execute a BUY order. + + Parameters + ---------- + asset : asset identifier (e.g. 
"Equity", "Gold") + quantity : number of shares / units to buy + price : raw market price + date : trade date (for audit log) + reason : human-readable reason (signal, rule) + prices : full {asset:price} dict for position limit check + """ + exec_price = self._apply_slippage(price, "BUY") + trade_value = exec_price * quantity + commission = self._apply_commission(trade_value) + total_cost = trade_value + commission + + # Guard rails + self._check_capital_shortfall(total_cost, f"BUY {quantity} {asset}") + if prices: + self._check_position_limit(asset, trade_value, prices) + + # Update state + cash_before = self.cash + self.cash -= total_cost + self._positions[asset] = self._positions.get(asset, 0) + quantity + + # Update cost basis (weighted average) + prev_shares = self._positions.get(asset, 0) - quantity + prev_cost = self._cost_basis.get(asset, exec_price) + if prev_shares > 0: + self._cost_basis[asset] = ( + (prev_shares * prev_cost + quantity * exec_price) + / self._positions[asset] + ) + else: + self._cost_basis[asset] = exec_price + + record = TradeRecord( + date = date, + asset = asset, + action = "BUY", + quantity = quantity, + price = exec_price, + raw_price = price, + slippage = (exec_price - price) * quantity, + commission = commission, + cash_before = cash_before, + cash_after = self.cash, + reason = reason, + ) + self._trade_log.append(record) + return record + + def sell( + self, + asset : str, + quantity : float, + price : float, + date : object, + reason : str = "", + ) -> TradeRecord: + """ + Execute a SELL order. 
def rebalance(
    self,
    target_weights : dict[str, float],
    prices         : dict[str, float],
    date           : object,
    reason         : str = "rebalance",
):
    """
    Rebalance to target weight allocation (Issue 11).

    Sells are executed before buys so the proceeds are available as cash.
    Only assets named in `target_weights` are traded; assets with a missing
    or non-positive price are skipped.  Share differences of 0.001 or less
    are ignored.

    Buys are routed through `buy()` with `prices`, so the per-asset position
    limit applies: a buy that takes an asset above `max_position_pct` of NAV
    raises ValueError *after* the sell leg has already executed.

    Parameters
    ----------
    target_weights : {asset: target_fraction_of_NAV} — each >= 0, sum <= 1.0
    prices         : {asset: current_price}
    date           : rebalance date
    reason         : text stored on every generated trade record

    Raises
    ------
    ValueError
        If the weights sum to more than 1 or any weight is negative
        (checked before any trade is placed).

    Example
    -------
        port = Portfolio(max_position_pct=0.60)
        port.rebalance(
            target_weights = {"Equity": 0.60, "Gold": 0.20, "Bonds": 0.10},
            prices         = {"Equity": 150, "Gold": 1800, "Bonds": 100},
            date           = "2020-06-30"
        )
    """
    # Explicit raises instead of `assert`: asserts vanish under `python -O`.
    total_weight = sum(target_weights.values())
    if total_weight > 1.001:
        raise ValueError(
            f"[portfolio] Weights must sum to <= 1 (got {total_weight:.4f})"
        )
    negative = [asset for asset, weight in target_weights.items() if weight < 0]
    if negative:
        raise ValueError(
            f"[portfolio] Negative target weights are not supported: {negative}"
        )

    nav = self.compute_nav(prices)

    # Calculate target shares and deltas
    deltas = {}
    for asset, weight in target_weights.items():
        current_price = prices.get(asset)
        if current_price is None or current_price <= 0:
            continue
        target_shares = (nav * weight) / current_price
        deltas[asset] = target_shares - self._positions.get(asset, 0)

    # Sell first to free up cash
    for asset, delta in deltas.items():
        if delta < -0.001:
            self.sell(asset, abs(delta), prices[asset], date, reason)

    # Cash needed per unit of raw price: slippage is applied to the price and
    # commission to the slipped trade value, so the two factors compound.
    cost_factor = (1 + self.slippage_pct) * (1 + self.transaction_cost)

    # Then buy with available cash
    for asset, delta in deltas.items():
        if delta <= 0.001:
            continue
        unit_cost = prices[asset] * cost_factor
        if delta * unit_cost > self.cash:
            # Not enough cash for the full target: spend 99% of what is left.
            delta = (self.cash * 0.99) / unit_cost
        if delta > 0.001:
            self.buy(asset, delta, prices[asset], date, reason, prices)
def get_returns(self) -> pd.Series:
    """
    Extract daily NAV returns from nav_history.

    Returns
    -------
    A float Series named "returns", indexed by "date".  When no snapshot
    has been recorded the Series is empty but keeps the same name and index
    name, so callers can treat both cases uniformly.
    """
    hist = self.nav_history
    if hist.empty:
        return pd.Series(
            dtype=float,
            name="returns",
            index=pd.Index([], name="date"),
        )
    return hist.set_index("date")["returns"]
def winsorize_returns(df: pd.DataFrame, sigma: float = 3.0) -> pd.DataFrame:
    """
    Clip extreme daily returns to +/- `sigma` standard deviations.

    For every column whose name contains "Returns" a sibling column with a
    `_clean` suffix is added, so the originals are preserved.  Columns that
    already end in `_clean` are skipped, which makes the function safe to
    call more than once (no `*_clean_clean` columns).

    Parameters
    ----------
    df    : DataFrame holding the return columns; modified in place
    sigma : half-width of the allowed band, in standard deviations

    Returns
    -------
    The same DataFrame object, with the `_clean` columns added.
    """
    return_cols = [
        c for c in df.columns
        if "Returns" in c and not c.endswith("_clean")
    ]
    for col in return_cols:
        series = df[col].dropna()
        center, spread = series.mean(), series.std()
        lo = center - sigma * spread
        hi = center + sigma * spread

        if pd.isna(lo) or pd.isna(hi):
            # Fewer than two observations: no band can be estimated.
            df[f"{col}_clean"] = df[col]
            continue

        df[f"{col}_clean"] = df[col].clip(lo, hi)
        n_clipped = ((df[col] < lo) | (df[col] > hi)).sum()
        if n_clipped:
            print(f"[preprocess] Winsorized {n_clipped:>4} rows in {col}")
    return df
df["Corr_Equity_Gold30"] = ( + df["Equity_Returns_clean"] + .rolling(30) + .corr(df["MA_Gold_Returns_clean"]) + ) + df["Corr_Equity_Oil30"] = ( + df["Equity_Returns_clean"] + .rolling(30) + .corr(df["Oil_Returns_clean"]) + ) + + # Drop the short rolling-window warmup NaNs from new features + df = df.dropna(subset=["Equity_RollingVol20", "Equity_Momentum30", + "Corr_Equity_Gold30"]).reset_index(drop=True) + + print(f"[preprocess] Feature engineering done. Shape: {df.shape}") + return df + + +# ── Master Preprocess Pipeline ──────────────────────────────────────────────── + +def preprocess(raw_df: pd.DataFrame) -> tuple[pd.DataFrame, StandardScaler]: + """ + Full pipeline: + raw_df -> cleaned + normalised + feature-engineered DataFrame + + Returns + ------- + df : processed DataFrame + scaler : fitted StandardScaler (for macro cols) + """ + df = drop_warmup_nans(raw_df) + df = impute_remaining_nans(df) + df = winsorize_returns(df) + df, scaler = normalize_macro(df) + df = add_rolling_features(df) + return df, scaler + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import sys + from pathlib import Path + sys.path.insert(0, str(Path(__file__).parent)) + + from ingestion import load_master + raw = load_master() + + df, scaler = preprocess(raw) + + print(f"\nFinal dataset shape : {df.shape}") + print(f"Date range : {df['Date'].min().date()} -> {df['Date'].max().date()}") + print(f"\nNew feature columns :") + new_cols = [c for c in df.columns if any(x in c for x in + ["Vol", "Momentum", "RoC", "Corr", "clean", "norm"])] + for c in new_cols: + print(f" {c}") + + remaining_nans = df.isnull().sum().sum() + print(f"\nRemaining NaNs: {remaining_nans}") diff --git a/src/risk.py b/src/risk.py new file mode 100644 index 00000000..2bbbf56a --- /dev/null +++ b/src/risk.py @@ -0,0 +1,494 @@ +""" +risk.py +------- +Risk metrics for the hedge fund system. 
+ +Covers Issues 6, 7, 12, 13 from ISSUES.md: + - Issue 6 : Value at Risk (VaR) -- Historical + Parametric + CVaR + - Issue 7 : Max Drawdown & Portfolio Volatility + - Issue 12 : Sharpe Ratio (risk-adjusted return) + - Issue 13 : Alpha & Beta vs a benchmark + +All functions operate on a pandas Series of daily returns +(or a DataFrame for multi-asset calculations). +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +from dataclasses import dataclass, field +from typing import Optional + + +# ───────────────────────────────────────────────────────────────────────────── +# Data containers +# ───────────────────────────────────────────────────────────────────────────── + +@dataclass +class VaRResult: + """Value at Risk and Conditional VaR at a given confidence level.""" + confidence : float # e.g. 0.95 for 95% + historical : float # historical simulation VaR + parametric : float # Gaussian parametric VaR + cvar : float # Conditional VaR (Expected Shortfall) + period_days : int # number of daily return observations used + + def __str__(self) -> str: + return ( + f"VaR ({self.confidence:.0%} conf, {self.period_days}d)\n" + f" Historical : {self.historical:.4%}\n" + f" Parametric : {self.parametric:.4%}\n" + f" CVaR (ES) : {self.cvar:.4%}" + ) + + +@dataclass +class DrawdownResult: + """Drawdown statistics for a returns series.""" + max_drawdown : float # worst peak-to-trough drop + max_dd_start : object # date when drawdown started + max_dd_end : object # date of trough + avg_drawdown : float # average of all drawdown periods + recovery_days : int # calendar days to recover from worst drawdown + drawdown_series : pd.Series = field(repr=False) # full daily drawdown values + + def __str__(self) -> str: + return ( + f"Drawdown Analysis\n" + f" Max Drawdown : {self.max_drawdown:.4%}\n" + f" DD Start : {self.max_dd_start}\n" + f" DD Trough : {self.max_dd_end}\n" + f" Recovery Days : {self.recovery_days}\n" + f" Avg Drawdown : 
{self.avg_drawdown:.4%}" + ) + + +@dataclass +class RiskMetrics: + """Complete risk/return profile for a returns series.""" + # Return metrics + total_return : float + ann_return : float + ann_volatility : float + + # Risk-adjusted + sharpe_ratio : float + sortino_ratio : float + calmar_ratio : float + + # Risk + var_95 : VaRResult + var_99 : VaRResult + drawdown : DrawdownResult + + # Alpha / Beta vs benchmark + alpha : Optional[float] = None + beta : Optional[float] = None + r_squared : Optional[float] = None + + def summary(self) -> str: + lines = [ + "=" * 50, + " RISK METRICS SUMMARY", + "=" * 50, + f" Total Return : {self.total_return:.4%}", + f" Annualised Ret : {self.ann_return:.4%}", + f" Annualised Vol : {self.ann_volatility:.4%}", + f" Sharpe Ratio : {self.sharpe_ratio:.4f}", + f" Sortino Ratio : {self.sortino_ratio:.4f}", + f" Calmar Ratio : {self.calmar_ratio:.4f}", + f" VaR 95% (hist) : {self.var_95.historical:.4%}", + f" VaR 99% (hist) : {self.var_99.historical:.4%}", + f" CVaR 95% : {self.var_95.cvar:.4%}", + f" Max Drawdown : {self.drawdown.max_drawdown:.4%}", + ] + if self.alpha is not None: + lines += [ + f" Alpha : {self.alpha:.6f}", + f" Beta : {self.beta:.4f}", + f" R-squared : {self.r_squared:.4f}", + ] + lines.append("=" * 50) + return "\n".join(lines) + + +# ───────────────────────────────────────────────────────────────────────────── +# Issue 6: Value at Risk +# ───────────────────────────────────────────────────────────────────────────── + +def calculate_var( + returns : pd.Series, + confidence: float = 0.95, + dates : Optional[pd.Series] = None, +) -> VaRResult: + """ + Compute VaR and CVaR using two methods: + + Historical Simulation + Sort the observed returns and take the appropriate quantile. + No distributional assumption — uses actual fat tails. + + Parametric (Gaussian) + VaR = mu - z * sigma where z is the inverse-normal quantile. + Fast but underestimates tail risk for fat-tailed distributions. 
def calculate_drawdown(
    returns : pd.Series,
    dates   : Optional[pd.Series] = None,
) -> DrawdownResult:
    """
    Compute the drawdown series and extract key drawdown statistics.

    The drawdown at time t is:
        DD(t) = (NAV(t) - peak NAV up to t) / peak NAV up to t

    NAV is compounded from a starting value of $1, and that starting value
    counts as the first peak, so a loss on the very first observation is
    reported as a drawdown.

    Parameters
    ----------
    returns : daily returns Series; NaN observations are dropped
    dates   : optional labels for start/end.  When it has the same length
              as `returns`, the entries belonging to NaN returns are
              dropped too so labels stay aligned.

    Returns
    -------
    DrawdownResult dataclass.  `max_dd_start` / `max_dd_end` are taken from
    `dates` when given, otherwise they are integer positions in the cleaned
    series.  `recovery_days` counts observations from the trough to the
    first NAV at or above the prior peak, or -1 if never recovered.  For an
    empty input every statistic is 0 and both labels are None.
    """
    valid = returns.notna()
    r = returns[valid].reset_index(drop=True)

    labels = None
    if dates is not None:
        labels = pd.Series(dates).reset_index(drop=True)
        if len(labels) == len(valid):
            # Drop the same rows that dropna() removed from the returns.
            labels = labels[valid.to_numpy()].reset_index(drop=True)

    if r.empty:
        return DrawdownResult(
            max_drawdown    = 0.0,
            max_dd_start    = None,
            max_dd_end      = None,
            avg_drawdown    = 0.0,
            recovery_days   = 0,
            drawdown_series = pd.Series(dtype=float),
        )

    # Net Asset Value (NAV) from $1; the initial $1 is itself a peak.
    nav    = (1 + r).cumprod()
    peak   = nav.cummax().clip(lower=1.0)
    dd_ser = (nav - peak) / peak

    max_dd_loc = int(dd_ser.idxmin())
    max_dd_val = float(dd_ser.iloc[max_dd_loc])
    peak_val   = float(peak.iloc[max_dd_loc])

    # Start of the worst drawdown: where NAV reached the peak it fell from.
    # If NAV never got back to the initial $1, the fall began at the first row.
    up_to_trough = nav.iloc[:max_dd_loc + 1]
    if up_to_trough.max() < peak_val:
        start_loc = 0
    else:
        start_loc = int(up_to_trough.idxmax())

    # Recovery: first time NAV >= peak at or after the trough
    recovered = nav.iloc[max_dd_loc:] >= peak_val
    if recovered.any():
        rec_days = int(recovered.idxmax() - max_dd_loc)
    else:
        rec_days = -1   # still in drawdown at end of dataset

    # Average drawdown (only negative periods)
    underwater = dd_ser[dd_ser < 0]
    avg_dd = float(underwater.mean()) if not underwater.empty else 0.0

    return DrawdownResult(
        max_drawdown    = max_dd_val,
        max_dd_start    = labels.iloc[start_loc] if labels is not None else start_loc,
        max_dd_end      = labels.iloc[max_dd_loc] if labels is not None else max_dd_loc,
        avg_drawdown    = avg_dd,
        recovery_days   = rec_days,
        drawdown_series = dd_ser,
    )
def sortino_ratio(
    returns    : pd.Series,
    risk_free  : float = 0.0,
    ann_factor : int   = 252,
) -> float:
    """
    Annualised Sortino Ratio.

    Like Sharpe but only penalises DOWNSIDE volatility — better for
    strategies with skewed return distributions.

        Sortino = (E[R] - Rf) / downside_std * sqrt(ann_factor)

    `downside_std` is the sample standard deviation of the negative excess
    returns.  When it cannot be estimated or is zero (fewer than two
    negative observations, or all of them identical) a floor of 1e-9 is
    used, so the result is always finite for non-empty input.

    Parameters
    ----------
    returns    : daily returns (NaNs are dropped)
    risk_free  : daily risk-free rate
    ann_factor : trading days per year
    """
    r        = returns.dropna()
    excess   = r - risk_free
    downside = excess[excess < 0]

    down_vol = downside.std() if len(downside) > 1 else float("nan")
    # `not (x > floor)` is also True for NaN, so one test covers NaN and 0.
    if not down_vol > 1e-9:
        down_vol = 1e-9

    return float((excess.mean() / down_vol) * np.sqrt(ann_factor))
def alpha_beta(
    portfolio_returns : pd.Series,
    benchmark_returns : pd.Series,
    risk_free         : float = 0.0,
    ann_factor        : int   = 252,
) -> tuple[float, float, float]:
    """
    Compute CAPM Alpha, Beta, and R-squared.

    Model:
        R_p - Rf = alpha + beta * (R_b - Rf) + epsilon

    Parameters
    ----------
    portfolio_returns : daily portfolio returns
    benchmark_returns : daily benchmark returns (e.g. equity index)
    risk_free         : daily risk-free rate
    ann_factor        : for annualising alpha

    Returns
    -------
    (alpha, beta, r_squared), with alpha annualised.

    The two series are aligned on their index and rows with a NaN in either
    are dropped.  Degenerate inputs are handled explicitly:
      - fewer than two aligned observations -> (nan, nan, nan)
      - zero-variance benchmark             -> beta = 0.0 and r_squared = 0.0
      - zero-variance portfolio             -> r_squared = 0.0

    Interpretation
    --------------
    beta > 1  : portfolio moves more than the market
    beta < 1  : portfolio is less volatile than the market
    beta < 0  : portfolio moves inversely to the market
    alpha > 0 : positive excess return above market risk-adjusted expectation
    """
    # Align and drop NaN
    df = pd.DataFrame({"p": portfolio_returns, "b": benchmark_returns}).dropna()
    if len(df) < 2:
        # A covariance needs at least two points; say so instead of letting
        # numpy emit degrees-of-freedom warnings.
        nan = float("nan")
        return nan, nan, nan

    ep = df["p"] - risk_free
    eb = df["b"] - risk_free

    # OLS: ep = alpha + beta * eb
    cov_matrix = np.cov(ep, eb)
    port_var   = float(cov_matrix[0, 0])
    bench_var  = float(cov_matrix[1, 1])

    beta_val    = float(cov_matrix[0, 1] / bench_var) if bench_var != 0 else 0.0
    alpha_daily = float(ep.mean() - beta_val * eb.mean())
    alpha_ann   = alpha_daily * ann_factor   # annualised

    # R-squared: correlation is undefined when either side is constant;
    # report 0.0 there, matching the beta = 0.0 convention above.
    if port_var > 0 and bench_var > 0:
        r_sq = float(ep.corr(eb)) ** 2
    else:
        r_sq = 0.0

    return alpha_ann, beta_val, r_sq
def rolling_sharpe(
    returns    : pd.Series,
    window     : int   = 252,
    risk_free  : float = 0.0,
    ann_factor : int   = 252,
) -> pd.Series:
    """
    Rolling annualised Sharpe Ratio.

    Parameters
    ----------
    returns    : daily returns
    window     : number of observations per rolling window
    risk_free  : daily risk-free rate
    ann_factor : trading days per year

    Returns
    -------
    Series aligned with `returns`.  Windows that are not yet full are NaN;
    windows with exactly zero volatility yield 0.0 (the same convention as
    `sharpe_ratio`) instead of +/-inf from a division by zero.
    """
    excess    = returns - risk_free
    roll_mean = excess.rolling(window).mean()
    roll_std  = excess.rolling(window).std()

    # Divide only where the volatility is usable; zero/NaN std becomes NaN.
    sharpe = (roll_mean / roll_std.where(roll_std > 0)) * np.sqrt(ann_factor)
    # A flat window has no risk-adjusted return to speak of: report 0.0.
    return sharpe.mask(roll_std == 0, 0.0)
class RiskAwareSignalEngine:
    """
    A rule-based, explainable signal generator.

    Rules are evaluated in priority order; the first one that fires decides
    the target weights (whatever is not allocated stays in cash):

    1. Macro fear : normalized sentiment < sentiment_threshold
                    -> 50% Gold, no Equity.
    2. High vol   : rolling volatility > vol_threshold
                    -> 40% Gold and 40% Equity, the equity leg capped at
                       max_equity_weight.
    3. Trend      : 30-day equity momentum > 0 -> max_equity_weight in
                    Equity; otherwise all cash.
    """

    # Equity weight used while de-risking, before the max_equity_weight cap.
    _DERISK_EQUITY_WEIGHT = 0.40

    def __init__(
        self,
        vol_threshold: float = 0.20,         # 20% annualized vol limit
        sentiment_threshold: float = -0.5,   # negative sentiment threshold (normalized)
        max_equity_weight: float = 0.90,     # Never go 100% equity
    ):
        self.vol_threshold = vol_threshold
        self.sentiment_threshold = sentiment_threshold
        self.max_equity_weight = max_equity_weight

    def generate_signal(self, row: pd.Series) -> SignalResult:
        """
        Evaluate a single day's data and return target portfolio weights.

        Parameters
        ----------
        row : pd.Series representing one row of the preprocessed DataFrame.
              Reads "Date" (required) and "Equity_Momentum30",
              "Equity_RollingVol20", "Sentiment_norm" (each defaults to 0.0
              when the key is absent).

        Returns
        -------
        SignalResult with the date, {"Equity": w, "Gold": w} target weights
        and a one-line explanation of the rule that fired.
        """
        # Read current state
        momentum = row.get("Equity_Momentum30", 0.0)
        vol      = row.get("Equity_RollingVol20", 0.0)
        sent     = row.get("Sentiment_norm", 0.0)

        date = row["Date"]

        # Default: All Cash
        weights = {"Equity": 0.0, "Gold": 0.0}
        reasons = []

        # 1. Risk Overlay: Extreme Macro Fear
        if sent < self.sentiment_threshold:
            weights["Gold"] = 0.50
            reasons.append(f"Risk-Off: Sentiment ({sent:.2f}) < {self.sentiment_threshold}")

        # 2. Risk Overlay: High Volatility
        elif vol > self.vol_threshold:
            # Reduce equity exposure, hedge with gold.  The equity leg must
            # still respect the configured cap, otherwise a cap below 40%
            # would be exceeded exactly when risk is highest.
            weights["Equity"] = min(self._DERISK_EQUITY_WEIGHT, self.max_equity_weight)
            weights["Gold"] = 0.40
            reasons.append(f"De-risk: Volatility ({vol:.1%}) > {self.vol_threshold:.1%}")

        # 3. Base Strategy: Trend Following
        elif momentum > 0:
            # Up-trend: Max allocation to Equity
            weights["Equity"] = self.max_equity_weight
            reasons.append(f"Trend-On: Momentum ({momentum:.2%}) > 0")
        else:
            # Down-trend: stay in cash (weights already zero)
            reasons.append(f"Trend-Off: Momentum ({momentum:.2%}) <= 0")

        # Clean up reason string
        final_reason = " | ".join(reasons) if reasons else "No clear signal, holding cash."

        return SignalResult(
            date=date,
            target_weights=weights,
            reason=final_reason
        )
Extreme Risk Overlay (Trumps ML) + if vol > self.vol_threshold: + weights["Equity"] = 0.20 + weights["Gold"] = 0.50 + reasons.append(f"De-risk Override: Volatility ({vol:.1%}) > {self.vol_threshold:.1%}") + + # 2. ML Prediction + else: + if up_prob >= self.prob_threshold: + # Strong conviction UP + weights["Equity"] = self.max_equity_weight + weights["Gold"] = 0.0 + reasons.append(f"ML Long: Up probability {up_prob:.1%} >= {self.prob_threshold:.1%}") + elif up_prob <= (1 - self.prob_threshold): + # Strong conviction DOWN + weights["Equity"] = 0.0 + weights["Gold"] = 0.50 + reasons.append(f"ML Short: Down probability {(1-up_prob):.1%} >= {self.prob_threshold:.1%}") + else: + # Low conviction -> Hold balanced/cash + weights["Equity"] = 0.40 + weights["Gold"] = 0.20 + reasons.append(f"ML Neutral: Prob {up_prob:.1%} (Low conviction)") + + final_reason = " | ".join(reasons) + + return SignalResult( + date=date, + target_weights=weights, + reason=final_reason + ) + +