#!/usr/bin/env python3
"""
╔══════════════════════════════════════════════════════════════════════╗
║          ANALYSE PROFONDE: CORRÉLATION SPY vs MARCHÉ CRYPTO        ║
║                                                                      ║
║  Objectif: Valider mathématiquement la relation entre les            ║
║  performances du spy et les tendances du marché crypto.              ║
║                                                                      ║
║  Méthodes:                                                           ║
║   1. Collecte 1 an de données Binance (daily + hourly)              ║
║   2. Simulation de surges sur données historiques                    ║
║   3. Feature engineering marché (BTC trend, volatilité, volume)     ║
║   4. ML: XGBoost + Random Forest (feature importance)               ║
║   5. Statistiques: Granger causalité, corrélations, régimes         ║
║                                                                      ║
║  Date: 2026-04-06                                                    ║
╚══════════════════════════════════════════════════════════════════════╝
"""

import json
import time
import os
import sys
import warnings
from datetime import datetime, timedelta
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
import requests
from scipy import stats as scipy_stats
from scipy.signal import find_peaks

warnings.filterwarnings('ignore')

# ═══════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════

BINANCE_BASE = "https://api.binance.com"  # Binance spot REST API root
LOOKBACK_DAYS = 365  # 1 year
MIN_VOLUME_USDT = 5_000_000  # $5M daily volume minimum

# Surge detection thresholds (mirror market_spy.py)
SURGE_THRESHOLD_FLASH = 1.0    # +1.0% in 1 candle
SURGE_THRESHOLD_BREAKOUT = 1.5  # +1.5% over 2 candles
SURGE_THRESHOLD_MOMENTUM = 4.0  # +4% cumulative (NOTE(review): the hourly simulation below uses a 5-candle window, not 20 as previously stated — confirm against market_spy.py)

# Holding simulation params
SIM_HOLD_PERIODS = [1, 2, 3, 5, 10, 15, 30]  # candles after surge (NOTE(review): unused in this chunk — presumably consumed later in the file)
SIM_STOP_LOSS = -1.2   # % hard stop-loss relative to entry price
SIM_TRAILING_LEVELS = {1.0: 0.3, 2.0: 1.0, 3.0: 1.8, 5.0: 2.5, 10.0: 4.0}  # max-PnL level (%) -> trailing distance (%)

RESULTS_FILE = "deep_spy_market_analysis_results.json"  # output path for the final JSON report


def log(msg, level="INFO"):
    """Print *msg* to stdout prefixed with a HH:MM:SS timestamp and a severity tag."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print("[{}] [{}] {}".format(stamp, level, msg))


# ═══════════════════════════════════════════════════════════════════════
# PHASE 1: DATA COLLECTION FROM BINANCE
# ═══════════════════════════════════════════════════════════════════════

def get_top_symbols(min_volume=MIN_VOLUME_USDT, max_symbols=50):
    """Return the highest-volume Binance USDT spot pairs.

    Queries the 24h ticker endpoint, filters out stablecoin pairs and
    dust-priced assets, then returns up to *max_symbols* dicts
    (symbol/volume/price/change_24h) sorted by quote volume, descending.
    """
    log("Collecting top symbols by volume...")
    resp = requests.get(f"{BINANCE_BASE}/api/v3/ticker/24hr", timeout=30)
    resp.raise_for_status()

    # Stable-vs-stable pairs carry no directional signal — exclude them.
    excluded = {'BUSDUSDT', 'USDCUSDT', 'TUSDUSDT', 'FDUSDUSDT', 'DAIUSDT',
                'EURUSDT', 'GBPUSDT', 'USDTUSDT', 'USDPUSDT'}

    candidates = []
    for ticker in resp.json():
        symbol = ticker['symbol']
        if symbol in excluded or not symbol.endswith('USDT'):
            continue
        quote_vol = float(ticker.get('quoteVolume', 0))
        last_price = float(ticker.get('lastPrice', 0))
        # Keep only liquid pairs priced above dust level.
        if quote_vol < min_volume or last_price <= 0.0005:
            continue
        candidates.append({
            'symbol': symbol,
            'volume': quote_vol,
            'price': last_price,
            'change_24h': float(ticker.get('priceChangePercent', 0))
        })

    selected = sorted(candidates, key=lambda p: p['volume'], reverse=True)[:max_symbols]
    log(f"Selected {len(selected)} symbols (min vol ${min_volume/1e6:.0f}M)")
    return selected


def fetch_klines(symbol, interval, days, end_time=None):
    """Download *days* worth of klines for *symbol* at *interval*.

    Pages through the Binance klines endpoint (max 1000 rows per request),
    advancing the cursor past the last candle received. Retries indefinitely
    on HTTP 429 (after a 10s back-off); any other error logs a warning and
    returns whatever was collected so far. Timestamps are epoch milliseconds.
    """
    if end_time is None:
        end_time = int(datetime.now().timestamp() * 1000)
    cursor = end_time - days * 24 * 3600 * 1000  # window start (ms)

    collected = []
    endpoint = f"{BINANCE_BASE}/api/v3/klines"
    while cursor < end_time:
        try:
            resp = requests.get(endpoint, params={
                'symbol': symbol,
                'interval': interval,
                'startTime': cursor,
                'endTime': end_time,
                'limit': 1000,
            }, timeout=30)
            # Rate-limited: back off, then retry the same page.
            if resp.status_code == 429:
                time.sleep(10)
                continue
            resp.raise_for_status()
            batch = resp.json()
            if not batch:
                break
            collected.extend(batch)
            # Resume just past the open time of the last candle received.
            cursor = batch[-1][0] + 1
            time.sleep(0.05)  # Rate limiting
        except Exception as e:
            log(f"Error fetching {symbol} {interval}: {e}", "WARN")
            time.sleep(1)
            break

    return collected


def klines_to_df(klines):
    """Convert raw Binance kline rows into a typed DataFrame.

    Returns an empty DataFrame for empty input; otherwise a frame indexed by
    ``open_time`` (datetime) with float OHLCV/quote-volume columns and an
    integer ``trades`` column.
    """
    if not klines:
        return pd.DataFrame()

    frame = pd.DataFrame(klines, columns=[
        'open_time', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_volume', 'trades', 'taker_buy_base',
        'taker_buy_quote', 'ignore'
    ])
    # Binance timestamps are epoch milliseconds.
    for ts_col in ('open_time', 'close_time'):
        frame[ts_col] = pd.to_datetime(frame[ts_col], unit='ms')
    numeric_cols = ['open', 'high', 'low', 'close', 'volume', 'quote_volume',
                    'taker_buy_base', 'taker_buy_quote']
    frame[numeric_cols] = frame[numeric_cols].astype(float)
    frame['trades'] = frame['trades'].astype(int)
    return frame.set_index('open_time')


def collect_all_data(symbols):
    """Collect one year of daily and hourly klines for the selected symbols.

    BTC and ETH are always included and placed first. Daily data is fetched
    for up to 50 symbols (kept when >30 candles exist); hourly data for the
    first 30 (kept when >100 candles exist). Returns ``(daily, hourly)``
    dicts mapping symbol -> DataFrame.
    """
    log("=" * 60)
    log("PHASE 1: COLLECTING BINANCE HISTORICAL DATA")
    log("=" * 60)

    # Benchmarks first, then the volume-ranked rest, capped at 50.
    core = {'BTCUSDT', 'ETHUSDT'}
    ordered = list(core) + [s['symbol'] for s in symbols if s['symbol'] not in core]
    ordered = ordered[:50]

    daily_data = {}
    log(f"Collecting daily klines for {len(ordered)} symbols (1 year)...")
    for idx, symbol in enumerate(ordered, start=1):
        frame = klines_to_df(fetch_klines(symbol, '1d', LOOKBACK_DAYS))
        if len(frame) > 30:
            daily_data[symbol] = frame
        if idx % 10 == 0:
            log(f"  Daily progress: {idx}/{len(ordered)} symbols")

    log(f"Collected daily data for {len(daily_data)} symbols")

    # Hourly klines only for the top symbols (used by the surge simulation).
    hourly_data = {}
    subset = ordered[:30]
    log(f"Collecting hourly klines for {len(subset)} symbols (1 year)...")
    for idx, symbol in enumerate(subset, start=1):
        frame = klines_to_df(fetch_klines(symbol, '1h', LOOKBACK_DAYS))
        if len(frame) > 100:
            hourly_data[symbol] = frame
        if idx % 5 == 0:
            log(f"  Hourly progress: {idx}/{len(subset)} symbols")

    log(f"Collected hourly data for {len(hourly_data)} symbols")
    return daily_data, hourly_data


# ═══════════════════════════════════════════════════════════════════════
# PHASE 2: FEATURE ENGINEERING
# ═══════════════════════════════════════════════════════════════════════

def compute_market_features(daily_data):
    """Compute daily market-level features from BTC + altcoins.

    Parameters
    ----------
    daily_data : dict[str, pd.DataFrame]
        Symbol -> daily klines frame (as produced by ``klines_to_df``),
        indexed by open time with open/high/low/close/quote_volume columns.

    Returns
    -------
    pd.DataFrame
        One row per BTC trading day: BTC returns/volatility/volume/momentum,
        altcoin breadth aggregates (when altcoin history is available), ETH
        benchmarks, a ``market_regime`` label and the day of week. Empty
        frame when BTC history is insufficient (< 60 days).
    """
    log("=" * 60)
    log("PHASE 2: FEATURE ENGINEERING — Market Conditions")
    log("=" * 60)

    btc = daily_data.get('BTCUSDT')
    if btc is None or len(btc) < 60:
        log("BTC data insufficient!", "ERROR")
        return pd.DataFrame()

    # BTC's calendar defines the feature index; everything else is reindexed to it.
    dates = btc.index

    features = pd.DataFrame(index=dates)

    # --- BTC Features ---
    features['btc_close'] = btc['close']
    features['btc_return_1d'] = btc['close'].pct_change() * 100
    features['btc_return_3d'] = btc['close'].pct_change(3) * 100
    features['btc_return_7d'] = btc['close'].pct_change(7) * 100
    features['btc_return_14d'] = btc['close'].pct_change(14) * 100
    features['btc_return_30d'] = btc['close'].pct_change(30) * 100
    features['btc_volatility_7d'] = features['btc_return_1d'].rolling(7).std()
    features['btc_volatility_30d'] = features['btc_return_1d'].rolling(30).std()
    features['btc_volume'] = btc['quote_volume']
    features['btc_volume_sma20'] = btc['quote_volume'].rolling(20).mean()
    features['btc_volume_ratio'] = btc['quote_volume'] / features['btc_volume_sma20']

    # BTC momentum indicators: position of price vs short/mid/long SMAs,
    # summed into a 0-3 trend score.
    features['btc_sma7'] = btc['close'].rolling(7).mean()
    features['btc_sma20'] = btc['close'].rolling(20).mean()
    features['btc_sma50'] = btc['close'].rolling(50).mean()
    features['btc_above_sma7'] = (btc['close'] > features['btc_sma7']).astype(int)
    features['btc_above_sma20'] = (btc['close'] > features['btc_sma20']).astype(int)
    features['btc_above_sma50'] = (btc['close'] > features['btc_sma50']).astype(int)
    features['btc_trend_score'] = features['btc_above_sma7'] + features['btc_above_sma20'] + features['btc_above_sma50']

    # BTC RSI (14), simple-moving-average variant; zero losses are mapped to
    # NaN so a pure up-streak yields NaN rather than a division error.
    delta = btc['close'].diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / loss.replace(0, np.nan)
    features['btc_rsi_14'] = 100 - (100 / (1 + rs))

    # BTC candle anatomy, as % of the open price.
    features['btc_body_pct'] = ((btc['close'] - btc['open']) / btc['open'] * 100).abs()
    features['btc_upper_wick'] = ((btc['high'] - btc[['open', 'close']].max(axis=1)) / btc['open'] * 100)
    features['btc_lower_wick'] = ((btc[['open', 'close']].min(axis=1) - btc['low']) / btc['open'] * 100)

    # --- Altcoin aggregate features ---
    # Only symbols with >= 60 days of history contribute to breadth stats.
    alt_returns = []
    alt_volumes = []

    for sym, df in daily_data.items():
        if sym == 'BTCUSDT' or len(df) < 60:
            continue
        ret = df['close'].pct_change() * 100
        ret = ret.reindex(dates)
        alt_returns.append(ret)
        vol = df['quote_volume'].reindex(dates)
        alt_volumes.append(vol)

    if alt_returns:
        alt_ret_df = pd.DataFrame(alt_returns).T  # rows = dates, cols = symbols
        features['alt_mean_return_1d'] = alt_ret_df.mean(axis=1)
        features['alt_median_return_1d'] = alt_ret_df.median(axis=1)
        features['alt_std_return_1d'] = alt_ret_df.std(axis=1)
        features['alt_pct_positive'] = (alt_ret_df > 0).mean(axis=1) * 100
        features['alt_pct_pump_1pct'] = (alt_ret_df > 1).mean(axis=1) * 100
        features['alt_pct_pump_3pct'] = (alt_ret_df > 3).mean(axis=1) * 100
        features['alt_pct_dump_1pct'] = (alt_ret_df < -1).mean(axis=1) * 100
        features['alt_max_return'] = alt_ret_df.max(axis=1)
        features['alt_min_return'] = alt_ret_df.min(axis=1)
        features['alt_dispersion'] = features['alt_max_return'] - features['alt_min_return']

        # Altcoin vs BTC divergence (positive = alts outperforming BTC).
        features['alt_btc_divergence'] = features['alt_mean_return_1d'] - features['btc_return_1d']

        # Rolling altcoin features
        features['alt_mean_return_7d'] = features['alt_mean_return_1d'].rolling(7).mean()
        features['alt_volatility_7d'] = features['alt_std_return_1d'].rolling(7).mean()

    if alt_volumes:
        alt_vol_df = pd.DataFrame(alt_volumes).T
        features['market_total_volume'] = alt_vol_df.sum(axis=1) + btc['quote_volume']
        features['market_volume_sma20'] = features['market_total_volume'].rolling(20).mean()
        features['market_volume_ratio'] = features['market_total_volume'] / features['market_volume_sma20']

    # --- ETH specific (important benchmark) ---
    eth = daily_data.get('ETHUSDT')
    if eth is not None and len(eth) > 30:
        eth_ret = eth['close'].pct_change() * 100
        eth_ret = eth_ret.reindex(dates)
        features['eth_return_1d'] = eth_ret
        features['eth_return_7d'] = eth['close'].pct_change(7).reindex(dates) * 100
        features['eth_btc_ratio'] = (eth['close'] / btc['close']).reindex(dates)

    # --- Market regime classification ---
    # Fix: 'alt_pct_positive' only exists when at least one altcoin had
    # enough history; the previous unconditional access raised KeyError on a
    # BTC-only dataset. Fall back to BTC-only thresholds in that case.
    features['market_regime'] = 'NEUTRAL'
    if 'alt_pct_positive' in features.columns:
        bull_mask = (features['btc_return_7d'] > 3) & (features['alt_pct_positive'] > 60)
        bear_mask = (features['btc_return_7d'] < -3) & (features['alt_pct_positive'] < 40)
    else:
        bull_mask = features['btc_return_7d'] > 3
        bear_mask = features['btc_return_7d'] < -3
    # CORRECTION covers mild BTC weakness not already labeled BEAR; it is
    # assigned last but excludes bear days, so BEAR labels survive.
    correction_mask = (features['btc_return_7d'] < -1) & ~bear_mask
    features.loc[bull_mask, 'market_regime'] = 'BULL'
    features.loc[bear_mask, 'market_regime'] = 'BEAR'
    features.loc[correction_mask, 'market_regime'] = 'CORRECTION'

    # Day of week (0 = Monday, 6 = Sunday)
    features['day_of_week'] = features.index.dayofweek

    log(f"Computed {len(features.columns)} market features over {len(features)} days")
    return features


# ═══════════════════════════════════════════════════════════════════════
# PHASE 3: SURGE SIMULATION ON HOURLY DATA
# ═══════════════════════════════════════════════════════════════════════

def simulate_surges_hourly(hourly_data):
    """Detect surges on hourly data and simulate spy-like entry/exit.

    For each symbol, scans hourly close-to-close returns for three surge
    patterns (FLASH: one candle >= +1%; BREAKOUT: two candles summing to
    >= +1.5% with the last >= +0.5%; MOMENTUM: 5 candles summing to >= +4%
    with >= 3 green candles and the last >= +0.3%), then replays a trade
    entered at the close of the surge candle with a hard stop-loss,
    tiered trailing stop, and stagnation exit, over at most 15 candles.

    Returns a DataFrame with one row per simulated surge trade.
    """
    log("=" * 60)
    log("PHASE 3: SIMULATING SURGES ON 1-YEAR HOURLY DATA")
    log("=" * 60)

    all_surges = []
    surge_count = 0

    for sym, df in hourly_data.items():
        # Skip symbols with too little history to scan reliably.
        if len(df) < 100:
            continue

        closes = df['close'].values
        highs = df['high'].values
        lows = df['low'].values
        volumes = df['quote_volume'].values
        times = df.index

        # Compute hourly returns.
        # returns[i] is the % change from candle i to candle i+1, so the
        # candle that "surged" for returns[i] is candle i+1.
        returns = np.diff(closes) / closes[:-1] * 100

        for i in range(2, len(returns)):
            surge_type = None
            surge_pct = 0

            # FLASH_SURGE: single candle >= +1%
            # (checked first, so it takes precedence over the other types)
            if returns[i] >= SURGE_THRESHOLD_FLASH:
                surge_type = 'FLASH_SURGE'
                surge_pct = returns[i]

            # BREAKOUT_SURGE: 2 candles >= +1.5% and last >= +0.5%
            elif (returns[i] >= 0.5 and
                  returns[i] + returns[i-1] >= SURGE_THRESHOLD_BREAKOUT):
                surge_type = 'BREAKOUT_SURGE'
                surge_pct = returns[i] + returns[i-1]

            # MOMENTUM_SURGE: gradual rise over several candles
            # (5-candle cumulative return, majority of candles green)
            elif i >= 5:
                total_5h = sum(returns[i-4:i+1])
                if total_5h >= SURGE_THRESHOLD_MOMENTUM and returns[i] >= 0.3:
                    green_count = sum(1 for r in returns[i-4:i+1] if r > 0)
                    if green_count >= 3:
                        surge_type = 'MOMENTUM_SURGE'
                        surge_pct = total_5h

            if surge_type is None:
                continue

            entry_idx = i + 1  # Enter at next candle open
            if entry_idx >= len(closes):
                continue

            # entry_idx == i+1 is the surge candle itself in `closes`
            # (returns are offset by one), so the fill price is the surge
            # candle's CLOSE — i.e. approximately the next candle's open.
            entry_price = closes[entry_idx]
            entry_time = times[entry_idx]

            # Simulate hold and exit over at most 15 subsequent candles.
            # Exit priority within each candle: hard SL, trailing stop,
            # stagnation; otherwise fall through to MAX_HOLD at loop end.
            max_pnl = 0
            exit_pnl = 0
            exit_reason = 'MAX_HOLD'
            hold_candles = 0

            for h in range(1, min(16, len(closes) - entry_idx)):
                current_price = closes[entry_idx + h]
                pnl_pct = (current_price - entry_price) / entry_price * 100
                max_pnl = max(max_pnl, pnl_pct)

                # Hard stop loss — checked against the candle LOW, so an
                # intra-candle dip triggers it even if the close recovered.
                low_price = lows[entry_idx + h]
                low_pnl = (low_price - entry_price) / entry_price * 100
                if low_pnl <= SIM_STOP_LOSS:
                    exit_pnl = SIM_STOP_LOSS
                    exit_reason = 'HARD_SL'
                    hold_candles = h
                    break

                # Trailing stop: the trailing distance widens with the best
                # PnL reached; the loop over sorted levels picks the value
                # for the highest level max_pnl has cleared.
                trail = 0.3  # default
                for level, t_val in sorted(SIM_TRAILING_LEVELS.items()):
                    if max_pnl >= level:
                        trail = t_val
                if max_pnl >= 0.5 and pnl_pct < max_pnl - trail:
                    exit_pnl = pnl_pct
                    exit_reason = 'TRAILING_STOP'
                    hold_candles = h
                    break

                # Stagnation (after 6 candles, if < 0.5%)
                if h >= 6 and pnl_pct < 0.5:
                    exit_pnl = pnl_pct
                    exit_reason = 'STAGNATION'
                    hold_candles = h
                    break

                hold_candles = h
                exit_pnl = pnl_pct

            # Record surge
            date_str = entry_time.strftime('%Y-%m-%d')
            all_surges.append({
                'symbol': sym,
                'date': date_str,
                'hour': entry_time.hour,
                'surge_type': surge_type,
                'surge_pct': round(surge_pct, 3),
                'entry_price': round(entry_price, 6),
                'exit_pnl_pct': round(exit_pnl, 3),
                'max_pnl_pct': round(max_pnl, 3),
                'hold_candles': hold_candles,
                'exit_reason': exit_reason,
                'is_win': exit_pnl > 0,
                # NOTE(review): the surge candle for returns[i] is candle i+1,
                # so volumes[i] is the candle BEFORE the surge — possible
                # off-by-one; confirm intended semantics before relying on it.
                'volume_at_surge': volumes[i] if i < len(volumes) else 0,
            })
            surge_count += 1

        # Per-symbol tally; rescans all_surges each time (O(total surges)
        # per symbol) — acceptable at this data size.
        log(f"  {sym}: found {sum(1 for s in all_surges if s['symbol'] == sym)} surges")

    log(f"Total surges detected: {surge_count}")
    return pd.DataFrame(all_surges)


def aggregate_daily_surges(surges_df):
    """Aggregate surge data per day for correlation with market features.

    Collapses per-surge rows into one statistics row per calendar day:
    counts, win rate (as %), PnL moments, per-surge-type counts, exit-reason
    rates and the number of distinct symbols involved. The result is indexed
    by datetime ``date``; empty input yields an empty DataFrame.
    """
    if surges_df.empty:
        return pd.DataFrame()

    def _n_equal(target):
        # Named-agg helper: count of group rows equal to *target*.
        return lambda series: (series == target).sum()

    daily = surges_df.groupby('date').agg(
        surge_count=('exit_pnl_pct', 'count'),
        surge_wins=('is_win', 'sum'),
        surge_win_rate=('is_win', 'mean'),
        surge_mean_pnl=('exit_pnl_pct', 'mean'),
        surge_median_pnl=('exit_pnl_pct', 'median'),
        surge_total_pnl=('exit_pnl_pct', 'sum'),
        surge_max_pnl=('max_pnl_pct', 'max'),
        surge_mean_max_pnl=('max_pnl_pct', 'mean'),
        surge_mean_magnitude=('surge_pct', 'mean'),
        surge_flash_count=('surge_type', _n_equal('FLASH_SURGE')),
        surge_breakout_count=('surge_type', _n_equal('BREAKOUT_SURGE')),
        surge_momentum_count=('surge_type', _n_equal('MOMENTUM_SURGE')),
        surge_mean_hold=('hold_candles', 'mean'),
        surge_sl_count=('exit_reason', _n_equal('HARD_SL')),
        surge_trail_count=('exit_reason', _n_equal('TRAILING_STOP')),
        unique_symbols=('symbol', 'nunique'),
    ).reset_index()

    daily['date'] = pd.to_datetime(daily['date'])
    daily = daily.set_index('date')
    daily['surge_sl_rate'] = daily['surge_sl_count'] / daily['surge_count']
    daily['surge_trail_rate'] = daily['surge_trail_count'] / daily['surge_count']
    daily['surge_win_rate'] = daily['surge_win_rate'] * 100  # fraction -> percent

    log(f"Aggregated daily surge data: {len(daily)} days")
    return daily


# ═══════════════════════════════════════════════════════════════════════
# PHASE 4: STATISTICAL ANALYSIS
# ═══════════════════════════════════════════════════════════════════════

def statistical_analysis(merged_df):
    """Comprehensive statistical analysis of market-surge relationships.

    Runs several analyses on the merged daily frame (market features joined
    with daily surge aggregates) and collects them in one results dict:
      - Pearson/Spearman correlations between market and spy columns,
      - Granger causality tests (market -> spy, lags 1-5 days),
      - performance broken down by market regime, volatility quartile and
        day of week,
      - 30-day rolling correlation stability for BTC trend vs spy PnL.

    Parameters: merged_df is expected to be date-indexed with the column
    names produced by compute_market_features / aggregate_daily_surges;
    every section degrades gracefully when columns are missing.
    """
    log("=" * 60)
    log("PHASE 4: STATISTICAL ANALYSIS")
    log("=" * 60)

    results = {}

    # --- 4.1 Pearson & Spearman Correlations ---
    log("Computing correlation matrices...")

    # Candidate market-condition columns (left side of each pair).
    market_cols = [
        'btc_return_1d', 'btc_return_7d', 'btc_return_14d', 'btc_return_30d',
        'btc_volatility_7d', 'btc_volatility_30d', 'btc_volume_ratio',
        'btc_rsi_14', 'btc_trend_score',
        'alt_mean_return_1d', 'alt_pct_positive', 'alt_pct_pump_1pct',
        'alt_pct_pump_3pct', 'alt_dispersion', 'alt_btc_divergence',
        'market_volume_ratio'
    ]

    # Candidate spy-performance columns (right side of each pair).
    spy_cols = [
        'surge_count', 'surge_win_rate', 'surge_mean_pnl', 'surge_total_pnl',
        'surge_mean_magnitude', 'surge_flash_count', 'surge_mean_max_pnl',
        'surge_sl_rate', 'surge_trail_rate'
    ]

    available_market = [c for c in market_cols if c in merged_df.columns]
    available_spy = [c for c in spy_cols if c in merged_df.columns]

    # Pairwise dropna, so each pair uses its own maximal clean sample;
    # pairs with fewer than 20 observations are skipped.
    correlations = {}
    for mc in available_market:
        for sc in available_spy:
            clean = merged_df[[mc, sc]].dropna()
            if len(clean) < 20:
                continue
            pearson_r, pearson_p = scipy_stats.pearsonr(clean[mc], clean[sc])
            spearman_r, spearman_p = scipy_stats.spearmanr(clean[mc], clean[sc])
            correlations[f"{mc} vs {sc}"] = {
                'pearson_r': round(pearson_r, 4),
                'pearson_p': round(pearson_p, 6),
                'spearman_r': round(spearman_r, 4),
                'spearman_p': round(spearman_p, 6),
                # Significance flags are based on the Pearson p-value only.
                'significant_5pct': pearson_p < 0.05,
                'significant_1pct': pearson_p < 0.01,
            }

    # Sort by absolute Pearson correlation
    sorted_corr = dict(sorted(correlations.items(),
                                key=lambda x: abs(x[1]['pearson_r']),
                                reverse=True))
    results['correlations'] = sorted_corr

    # Top correlations (console summary only; full set is in the results)
    log(f"Top correlations (|r| > 0.15):")
    for k, v in sorted_corr.items():
        if abs(v['pearson_r']) > 0.15:
            sig = "***" if v['significant_1pct'] else ("**" if v['significant_5pct'] else "")
            log(f"  {k}: r={v['pearson_r']:+.3f} (p={v['pearson_p']:.4f}) {sig}")

    # --- 4.2 Granger Causality Tests ---
    log("Running Granger causality tests...")
    try:
        from statsmodels.tsa.stattools import grangercausalitytests
        granger_results = {}

        # (cause, effect) pairs: does the market variable help predict the
        # spy metric beyond the spy metric's own history?
        key_pairs = [
            ('btc_return_1d', 'surge_mean_pnl'),
            ('btc_return_1d', 'surge_count'),
            ('btc_return_1d', 'surge_win_rate'),
            ('btc_volatility_7d', 'surge_mean_pnl'),
            ('btc_volatility_7d', 'surge_count'),
            ('alt_pct_positive', 'surge_mean_pnl'),
            ('alt_pct_pump_1pct', 'surge_count'),
            ('market_volume_ratio', 'surge_mean_pnl'),
            ('btc_trend_score', 'surge_win_rate'),
        ]

        for cause_col, effect_col in key_pairs:
            if cause_col not in merged_df.columns or effect_col not in merged_df.columns:
                continue
            # Column order matters: statsmodels tests whether the SECOND
            # column Granger-causes the FIRST, hence [effect, cause].
            clean = merged_df[[effect_col, cause_col]].dropna()
            if len(clean) < 30:
                continue
            try:
                gc = grangercausalitytests(clean, maxlag=5, verbose=False)
                # Get min p-value across lags 1..5 (ssr F-test); note no
                # multiple-comparison correction is applied here.
                min_p = min(gc[lag][0]['ssr_ftest'][1] for lag in range(1, 6))
                best_lag = min(range(1, 6), key=lambda l: gc[l][0]['ssr_ftest'][1])
                granger_results[f"{cause_col} → {effect_col}"] = {
                    'min_p_value': round(min_p, 6),
                    'best_lag': best_lag,
                    'significant': min_p < 0.05,
                    'interpretation': (
                        f"Le marché ({cause_col}) PRÉDIT {'significativement' if min_p < 0.05 else 'PAS significativement'} "
                        f"le spy ({effect_col}) avec un lag de {best_lag} jours"
                    )
                }
                sig_str = "✅ SIGNIFICATIF" if min_p < 0.05 else "❌ Non significatif"
                log(f"  {cause_col} → {effect_col}: p={min_p:.4f} lag={best_lag}d {sig_str}")
            except Exception as e:
                log(f"  Granger test failed for {cause_col}→{effect_col}: {e}", "WARN")

        results['granger_causality'] = granger_results
    except ImportError:
        log("statsmodels not available for Granger tests", "WARN")
        results['granger_causality'] = {}

    # --- 4.3 Market Regime Analysis ---
    # Average spy performance per market_regime label; regimes with fewer
    # than 5 days are skipped as too noisy.
    log("Analyzing performance by market regime...")
    if 'market_regime' in merged_df.columns and 'surge_mean_pnl' in merged_df.columns:
        regime_stats = {}
        for regime in ['BULL', 'NEUTRAL', 'CORRECTION', 'BEAR']:
            mask = merged_df['market_regime'] == regime
            subset = merged_df[mask]
            if len(subset) < 5:
                continue
            regime_stats[regime] = {
                'days': int(len(subset)),
                'pct_of_total': round(len(subset) / len(merged_df) * 100, 1),
                'avg_surge_count': round(subset['surge_count'].mean(), 1) if 'surge_count' in subset else 0,
                'avg_surge_pnl': round(subset['surge_mean_pnl'].mean(), 3) if 'surge_mean_pnl' in subset else 0,
                'avg_total_pnl': round(subset['surge_total_pnl'].mean(), 1) if 'surge_total_pnl' in subset else 0,
                'avg_win_rate': round(subset['surge_win_rate'].mean(), 1) if 'surge_win_rate' in subset else 0,
                'avg_surge_magnitude': round(subset['surge_mean_magnitude'].mean(), 2) if 'surge_mean_magnitude' in subset else 0,
                'btc_avg_return': round(subset['btc_return_1d'].mean(), 2) if 'btc_return_1d' in subset else 0,
            }
            log(f"  {regime}: {regime_stats[regime]['days']}d, "
                f"surges/d={regime_stats[regime]['avg_surge_count']:.0f}, "
                f"PnL/surge={regime_stats[regime]['avg_surge_pnl']:+.2f}%, "
                f"WR={regime_stats[regime]['avg_win_rate']:.0f}%")

        results['regime_analysis'] = regime_stats

    # --- 4.4 Volatility regime analysis ---
    # Quartile buckets of 7-day BTC volatility; bucket edges come from the
    # empirical 25/50/75% quantiles of the clean sample.
    log("Analyzing by volatility regime...")
    if 'btc_volatility_7d' in merged_df.columns and 'surge_mean_pnl' in merged_df.columns:
        vol_clean = merged_df[['btc_volatility_7d', 'surge_mean_pnl', 'surge_count',
                                'surge_win_rate', 'surge_total_pnl']].dropna()
        if len(vol_clean) > 20:
            vol_q = vol_clean['btc_volatility_7d'].quantile([0.25, 0.5, 0.75])
            vol_regimes = {}
            labels = ['Low Vol', 'Med-Low Vol', 'Med-High Vol', 'High Vol']
            bounds = [-np.inf, vol_q[0.25], vol_q[0.5], vol_q[0.75], np.inf]
            for i, label in enumerate(labels):
                # Half-open buckets: (bounds[i], bounds[i+1]].
                mask = (vol_clean['btc_volatility_7d'] > bounds[i]) & (vol_clean['btc_volatility_7d'] <= bounds[i+1])
                subset = vol_clean[mask]
                if len(subset) < 3:
                    continue
                vol_regimes[label] = {
                    'days': int(len(subset)),
                    'vol_range': f"{bounds[i]:.2f} - {bounds[i+1]:.2f}",
                    'avg_surge_count': round(subset['surge_count'].mean(), 1),
                    'avg_surge_pnl': round(subset['surge_mean_pnl'].mean(), 3),
                    'avg_total_pnl': round(subset['surge_total_pnl'].mean(), 1),
                    'avg_win_rate': round(subset['surge_win_rate'].mean(), 1),
                }
                log(f"  {label}: surges/d={vol_regimes[label]['avg_surge_count']:.0f}, "
                    f"PnL={vol_regimes[label]['avg_surge_pnl']:+.3f}%, "
                    f"WR={vol_regimes[label]['avg_win_rate']:.0f}%")

            results['volatility_regime_analysis'] = vol_regimes

    # --- 4.5 Day of week analysis ---
    # Keyed by French day names (Monday=0 ... Sunday=6).
    if 'day_of_week' in merged_df.columns and 'surge_mean_pnl' in merged_df.columns:
        dow_names = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
        dow_stats = {}
        for d in range(7):
            subset = merged_df[merged_df['day_of_week'] == d]
            if len(subset) < 5:
                continue
            dow_stats[dow_names[d]] = {
                'avg_surge_count': round(subset['surge_count'].mean(), 1),
                'avg_pnl': round(subset['surge_mean_pnl'].mean(), 3),
                'avg_win_rate': round(subset['surge_win_rate'].mean(), 1),
                'avg_total_pnl': round(subset['surge_total_pnl'].mean(), 1),
            }
        results['day_of_week'] = dow_stats
        log(f"Day of week analysis complete: best day = {max(dow_stats, key=lambda x: dow_stats[x]['avg_pnl']) if dow_stats else 'N/A'}")

    # --- 4.6 Rolling correlation stability ---
    # 30-day sliding-window Pearson correlation; each window covers rows
    # [i-window, i) so the current day is excluded from its own window.
    log("Computing rolling correlations (30d window)...")
    if 'btc_return_7d' in merged_df.columns and 'surge_mean_pnl' in merged_df.columns:
        rolling_corrs = []
        window = 30
        clean = merged_df[['btc_return_7d', 'surge_mean_pnl']].dropna()
        for i in range(window, len(clean)):
            chunk = clean.iloc[i-window:i]
            r, _ = scipy_stats.pearsonr(chunk['btc_return_7d'], chunk['surge_mean_pnl'])
            rolling_corrs.append({
                'date': str(clean.index[i].date()),
                'correlation': round(r, 4)
            })
        results['rolling_correlation_btc_vs_pnl'] = rolling_corrs
        if rolling_corrs:
            corr_vals = [c['correlation'] for c in rolling_corrs]
            log(f"  Rolling corr(BTCtrend, SpyPnL): mean={np.mean(corr_vals):+.3f}, "
                f"std={np.std(corr_vals):.3f}, "
                f"pct_positive={sum(1 for c in corr_vals if c > 0)/len(corr_vals)*100:.0f}%")

    return results


# ═══════════════════════════════════════════════════════════════════════
# PHASE 5: MACHINE LEARNING ANALYSIS
# ═══════════════════════════════════════════════════════════════════════

def ml_analysis(merged_df):
    """XGBoost + RandomForest feature-importance and prediction analysis.

    For each surge-performance target present in ``merged_df``, makes a
    strictly time-ordered 80/20 train/test split and fits three regressors
    (XGBoost, RandomForest, and a TimeSeriesSplit-cross-validated
    GradientBoosting).  Finally trains a binary GradientBoosting classifier
    for "good day" (surge_mean_pnl > 0).

    Each model family is isolated in its own helper so that a failure in
    one (e.g. xgboost not installed) cannot poison the others.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Daily market features joined with daily surge aggregates.

    Returns
    -------
    dict
        One entry per target with per-model metrics, plus an optional
        'binary_classification' entry.
    """
    log("=" * 60)
    log("PHASE 5: MACHINE LEARNING ANALYSIS")
    log("=" * 60)

    results = {}

    feature_cols = [
        'btc_return_1d', 'btc_return_3d', 'btc_return_7d', 'btc_return_14d',
        'btc_return_30d', 'btc_volatility_7d', 'btc_volatility_30d',
        'btc_volume_ratio', 'btc_rsi_14', 'btc_trend_score',
        'alt_mean_return_1d', 'alt_pct_positive', 'alt_pct_pump_1pct',
        'alt_pct_pump_3pct', 'alt_dispersion', 'alt_btc_divergence',
        'alt_mean_return_7d', 'alt_volatility_7d', 'market_volume_ratio',
        'day_of_week', 'btc_body_pct', 'btc_upper_wick', 'btc_lower_wick'
    ]

    # Only keep the features that actually exist in the merged dataset.
    available = [c for c in feature_cols if c in merged_df.columns]

    # --- 5.1 Regression: predict each surge performance target ---
    for target in ['surge_mean_pnl', 'surge_win_rate', 'surge_count', 'surge_total_pnl']:
        if target not in merged_df.columns:
            continue

        log(f"Training models for target: {target}")
        clean = merged_df[available + [target]].dropna()
        if len(clean) < 50:
            log(f"  Insufficient data ({len(clean)} rows), skipping", "WARN")
            continue

        X = clean[available]
        y = clean[target]

        # Time-based train/test split (80/20) — no shuffling, so the test
        # period lies strictly after the training period (no look-ahead).
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
        y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

        log(f"  Train: {len(X_train)} days, Test: {len(X_test)} days")

        target_results = {}

        xgb_metrics = _fit_xgboost(X_train, X_test, y_train, y_test, available)
        if xgb_metrics is not None:
            target_results['xgboost'] = xgb_metrics

        rf_metrics = _fit_random_forest(X_train, X_test, y_train, y_test, available)
        if rf_metrics is not None:
            target_results['random_forest'] = rf_metrics

        cv_metrics = _cross_validate_gb(X, y)
        if cv_metrics is not None:
            target_results['cross_validation'] = cv_metrics

        results[target] = target_results

    # --- 5.2 Classification: Good day vs Bad day ---
    log("Training binary classifier: Good vs Bad spy day...")
    if 'surge_mean_pnl' in merged_df.columns:
        clean = merged_df[available + ['surge_mean_pnl']].dropna()
        if len(clean) >= 50:
            cls_metrics = _fit_good_day_classifier(clean, available)
            if cls_metrics is not None:
                results['binary_classification'] = cls_metrics

    return results


def _fit_xgboost(X_train, X_test, y_train, y_test, feature_names):
    """Fit an XGBoost regressor; return its metrics dict, or None on failure."""
    try:
        import xgboost as xgb
        from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

        model = xgb.XGBRegressor(
            n_estimators=300,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=1.0,
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )
        model.fit(X_train, y_train,
                  eval_set=[(X_test, y_test)],
                  verbose=False)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        feat_imp = sorted(zip(feature_names, model.feature_importances_),
                          key=lambda x: x[1], reverse=True)

        metrics = {
            'r2_score': round(r2, 4),
            'mae': round(mae, 4),
            'rmse': round(rmse, 4),
            'feature_importance_top10': [
                {'feature': f, 'importance': round(float(i), 4)}
                for f, i in feat_imp[:10]
            ],
            # % of test days where the predicted sign matches reality.
            'prediction_direction_accuracy': round(
                np.mean(np.sign(y_pred) == np.sign(y_test.values)) * 100, 1
            )
        }

        log(f"  XGBoost R²={r2:.3f}, MAE={mae:.4f}, Direction={metrics['prediction_direction_accuracy']:.0f}%")
        log(f"  Top features: {', '.join(f'{f}({i:.3f})' for f, i in feat_imp[:5])}")
        return metrics

    except Exception as e:
        log(f"  XGBoost error: {e}", "WARN")
        return None


def _fit_random_forest(X_train, X_test, y_train, y_test, feature_names):
    """Fit a RandomForest regressor; return its metrics dict, or None on failure.

    Fix: imports mean_squared_error locally — previously it was imported
    only inside the XGBoost branch, so this model failed with a NameError
    whenever xgboost was unavailable.
    """
    try:
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

        model = RandomForestRegressor(
            n_estimators=500,
            max_depth=8,
            min_samples_leaf=5,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        feat_imp = sorted(zip(feature_names, model.feature_importances_),
                          key=lambda x: x[1], reverse=True)

        metrics = {
            'r2_score': round(r2, 4),
            'mae': round(mae, 4),
            'rmse': round(rmse, 4),
            'feature_importance_top10': [
                {'feature': f, 'importance': round(float(i), 4)}
                for f, i in feat_imp[:10]
            ],
            'prediction_direction_accuracy': round(
                np.mean(np.sign(y_pred) == np.sign(y_test.values)) * 100, 1
            )
        }

        log(f"  RandomForest R²={r2:.3f}, MAE={mae:.4f}")
        return metrics

    except Exception as e:
        log(f"  RandomForest error: {e}", "WARN")
        return None


def _cross_validate_gb(X, y):
    """5-fold TimeSeriesSplit CV with GradientBoosting; None on failure."""
    try:
        from sklearn.model_selection import TimeSeriesSplit
        from sklearn.ensemble import GradientBoostingRegressor

        tscv = TimeSeriesSplit(n_splits=5)
        cv_scores = []
        for train_idx, test_idx in tscv.split(X):
            gb = GradientBoostingRegressor(
                n_estimators=200, max_depth=4, learning_rate=0.05,
                random_state=42
            )
            gb.fit(X.iloc[train_idx], y.iloc[train_idx])
            cv_scores.append(gb.score(X.iloc[test_idx], y.iloc[test_idx]))

        result = {
            'method': 'TimeSeriesSplit (5 folds)',
            'r2_scores': [round(s, 4) for s in cv_scores],
            'mean_r2': round(np.mean(cv_scores), 4),
            'std_r2': round(np.std(cv_scores), 4),
        }
        log(f"  CV R² mean={np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")
        return result

    except Exception as e:
        log(f"  CV error: {e}", "WARN")
        return None


def _fit_good_day_classifier(clean, feature_names):
    """Binary GradientBoosting classifier for surge_mean_pnl > 0; None on failure."""
    X = clean[feature_names]
    y_cls = (clean['surge_mean_pnl'] > 0).astype(int)

    # Same time-ordered 80/20 split as the regression targets.
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y_cls.iloc[:split_idx], y_cls.iloc[split_idx:]

    try:
        from sklearn.ensemble import GradientBoostingClassifier
        from sklearn.metrics import roc_auc_score

        clf = GradientBoostingClassifier(
            n_estimators=300, max_depth=4, learning_rate=0.05,
            random_state=42
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_prob = clf.predict_proba(X_test)[:, 1]

        accuracy = np.mean(y_pred == y_test.values) * 100
        try:
            auc = roc_auc_score(y_test, y_prob)
        except Exception:
            # roc_auc_score raises when the test fold contains a single class.
            auc = 0

        feat_imp = sorted(zip(feature_names, clf.feature_importances_),
                          key=lambda x: x[1], reverse=True)

        metrics = {
            'target': 'surge_mean_pnl > 0 (Good day)',
            'accuracy': round(accuracy, 1),
            'auc_roc': round(auc, 4),
            # Accuracy of always predicting the majority class in the test set.
            'baseline_accuracy': round(max(y_test.mean(), 1 - y_test.mean()) * 100, 1),
            'feature_importance_top10': [
                {'feature': f, 'importance': round(float(i), 4)}
                for f, i in feat_imp[:10]
            ],
            'train_positive_rate': round(y_train.mean() * 100, 1),
            'test_positive_rate': round(y_test.mean() * 100, 1),
        }

        log(f"  Classification accuracy: {accuracy:.1f}% (baseline: {metrics['baseline_accuracy']:.1f}%)")
        log(f"  AUC-ROC: {auc:.3f}")
        log(f"  Top predictive features: {', '.join(f'{f}({i:.3f})' for f, i in feat_imp[:5])}")
        return metrics

    except Exception as e:
        log(f"  Classification error: {e}", "WARN")
        return None


# ═══════════════════════════════════════════════════════════════════════
# PHASE 6: ADVANCED ANALYSIS — OPTIMAL CONDITIONS
# ═══════════════════════════════════════════════════════════════════════

def advanced_analysis(merged_df, surges_df):
    """Advanced analysis: optimal conditions, conditional probabilities.

    Slices the merged daily dataset along BTC-trend, RSI and volume bands,
    profiles simulated surges by hour of day and by surge type, and derives
    a correlation-weighted composite "spy quality" score.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Daily market features joined with daily surge aggregates.
    surges_df : pd.DataFrame
        One row per simulated surge trade (may be empty).

    Returns
    -------
    dict
        Conditional statistics keyed by analysis name.
    """
    log("=" * 60)
    log("PHASE 6: ADVANCED ANALYSIS — Optimal Trading Conditions")
    log("=" * 60)

    results = {}

    # --- 6.1 Conditional analysis: When is spy most profitable? ---
    log("Finding optimal market conditions for spy trading...")

    # Without surge aggregates none of the conditional slices below are
    # computable, so bail out early (empty result dict).
    if 'surge_mean_pnl' not in merged_df.columns:
        return results

    # BTC trend conditions
    conditions = {}
    if 'btc_return_7d' in merged_df.columns:
        conditions = _condition_stats(
            merged_df, 'btc_return_7d',
            [('BTC strong bear (<-5%)', -999, -5),
             ('BTC bear (-5% to -1%)', -5, -1),
             ('BTC flat (-1% to +1%)', -1, 1),
             ('BTC bull (+1% to +5%)', 1, 5),
             ('BTC strong bull (>+5%)', 5, 999)],
            with_total_pnl=True, with_count=True)
        for label, entry in conditions.items():
            log(f"  {label}: {entry['days']}d, "
                f"PnL/surge={entry['avg_surge_pnl']:+.3f}%, "
                f"WR={entry['avg_win_rate']:.0f}%")

    results['btc_trend_conditions'] = conditions

    # RSI conditions
    rsi_conditions = {}
    if 'btc_rsi_14' in merged_df.columns:
        rsi_conditions = _condition_stats(
            merged_df, 'btc_rsi_14',
            [('Oversold (<30)', 0, 30),
             ('Weak (30-40)', 30, 40),
             ('Neutral (40-60)', 40, 60),
             ('Strong (60-70)', 60, 70),
             ('Overbought (>70)', 70, 100)],
            with_count=True)
    results['rsi_conditions'] = rsi_conditions

    # Volume conditions
    vol_conditions = {}
    if 'market_volume_ratio' in merged_df.columns:
        vol_conditions = _condition_stats(
            merged_df, 'market_volume_ratio',
            [('Very Low Vol (<0.7x)', 0, 0.7),
             ('Low Vol (0.7-0.9x)', 0.7, 0.9),
             ('Normal Vol (0.9-1.1x)', 0.9, 1.1),
             ('High Vol (1.1-1.5x)', 1.1, 1.5),
             ('Very High Vol (>1.5x)', 1.5, 999)])
    results['volume_conditions'] = vol_conditions

    # --- 6.2 Surge profitability by hour ---
    if not surges_df.empty and 'hour' in surges_df.columns:
        log("Analyzing surge profitability by hour of day...")
        hourly_stats = {}
        for h in range(24):
            subset = surges_df[surges_df['hour'] == h]
            # Require at least 10 surges per hour to avoid noisy stats.
            if len(subset) < 10:
                continue
            hourly_stats[f"{h:02d}h"] = {
                'total_surges': int(len(subset)),
                'avg_pnl': round(subset['exit_pnl_pct'].mean(), 4),
                'win_rate': round(subset['is_win'].mean() * 100, 1),
                'avg_magnitude': round(subset['surge_pct'].mean(), 2),
                # Most frequent surge type in this hour.
                'best_type': subset['surge_type'].value_counts().index[0],
            }
        results['hourly_analysis'] = hourly_stats
        if hourly_stats:
            best_hour = max(hourly_stats, key=lambda x: hourly_stats[x]['avg_pnl'])
            worst_hour = min(hourly_stats, key=lambda x: hourly_stats[x]['avg_pnl'])
            log(f"  Best hour: {best_hour} (PnL={hourly_stats[best_hour]['avg_pnl']:+.3f}%)")
            log(f"  Worst hour: {worst_hour} (PnL={hourly_stats[worst_hour]['avg_pnl']:+.3f}%)")

    # --- 6.3 Surge type performance ---
    if not surges_df.empty:
        log("Analyzing by surge type...")
        type_stats = {}
        for stype in ['FLASH_SURGE', 'BREAKOUT_SURGE', 'MOMENTUM_SURGE']:
            subset = surges_df[surges_df['surge_type'] == stype]
            if len(subset) < 10:
                continue
            type_stats[stype] = {
                'total': int(len(subset)),
                'pct_of_all': round(len(subset) / len(surges_df) * 100, 1),
                'avg_pnl': round(subset['exit_pnl_pct'].mean(), 4),
                'median_pnl': round(subset['exit_pnl_pct'].median(), 4),
                'win_rate': round(subset['is_win'].mean() * 100, 1),
                'avg_max_pnl': round(subset['max_pnl_pct'].mean(), 3),
                'avg_magnitude': round(subset['surge_pct'].mean(), 2),
                'avg_hold': round(subset['hold_candles'].mean(), 1),
                # Share of trades stopped out on the hard stop-loss.
                'sl_rate': round((subset['exit_reason'] == 'HARD_SL').mean() * 100, 1),
            }
            log(f"  {stype}: {type_stats[stype]['total']} surges, "
                f"PnL={type_stats[stype]['avg_pnl']:+.3f}%, "
                f"WR={type_stats[stype]['win_rate']:.0f}%, "
                f"SL={type_stats[stype]['sl_rate']:.0f}%")
        results['surge_type_analysis'] = type_stats

    # --- 6.4 Composite "spy quality" score derivation ---
    log("Deriving composite spy quality score...")
    if all(c in merged_df.columns for c in ['btc_return_7d', 'btc_volatility_7d', 'alt_pct_positive', 'surge_mean_pnl']):
        from sklearn.preprocessing import StandardScaler

        feature_set = ['btc_return_7d', 'btc_volatility_7d', 'alt_pct_positive',
                       'btc_rsi_14', 'alt_pct_pump_1pct', 'market_volume_ratio']
        available_feats = [f for f in feature_set if f in merged_df.columns]
        clean = merged_df[available_feats + ['surge_mean_pnl']].dropna()

        if len(clean) > 30:
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(clean[available_feats])

            # Correlate each standardized feature with daily surge PnL.
            correlations_to_pnl = {}
            for i, feat in enumerate(available_feats):
                r, p = scipy_stats.pearsonr(X_scaled[:, i], clean['surge_mean_pnl'])
                correlations_to_pnl[feat] = {'r': r, 'p': p}

            # Weight features by |correlation| and normalize to sum to 1.
            weights = {f: abs(v['r']) for f, v in correlations_to_pnl.items()}
            total_w = sum(weights.values())
            if total_w > 0:
                normalized_weights = {f: w / total_w for f, w in weights.items()}

                results['spy_quality_score'] = {
                    'formula': 'weighted_sum(normalized_features, correlation_weights)',
                    'feature_weights': {f: round(w, 4) for f, w in normalized_weights.items()},
                    'feature_correlations': {
                        f: {'r': round(v['r'], 4), 'p': round(v['p'], 6)}
                        for f, v in correlations_to_pnl.items()
                    },
                    'interpretation': (
                        'Score composite pour évaluer la qualité du marché pour le spy. '
                        'Score > 0 = conditions favorables. Score < 0 = conditions défavorables.'
                    )
                }

    return results


def _condition_stats(merged_df, column, bands, with_total_pnl=False, with_count=False):
    """Aggregate daily surge stats over half-open [lo, hi) bands of `column`.

    Bands with fewer than 5 matching days are skipped to avoid noisy
    averages.  Entry key order mirrors the original report layout.
    """
    stats = {}
    for label, lo, hi in bands:
        subset = merged_df[(merged_df[column] >= lo) & (merged_df[column] < hi)]
        if len(subset) < 5:
            continue
        entry = {
            'days': int(len(subset)),
            'avg_surge_pnl': round(subset['surge_mean_pnl'].mean(), 4),
        }
        if with_total_pnl:
            entry['avg_total_pnl'] = round(subset['surge_total_pnl'].mean(), 2)
        entry['avg_win_rate'] = round(subset['surge_win_rate'].mean(), 1)
        if with_count:
            entry['avg_surge_count'] = round(subset['surge_count'].mean(), 1)
        stats[label] = entry
    return stats


# ═══════════════════════════════════════════════════════════════════════
# MAIN: RUN COMPLETE ANALYSIS
# ═══════════════════════════════════════════════════════════════════════

def main():
    """Run the full SPY-vs-market analysis pipeline end to end.

    Phases: (1) collect Binance data, (2) engineer market features,
    (3) simulate surge trades on hourly data, (4) statistical analysis,
    (5) ML analysis, (6) advanced conditional analysis.  Results are
    compiled into one dict, conclusions are derived, everything is dumped
    to RESULTS_FILE as JSON, and a console summary is printed.
    """
    start_time = time.time()

    log("╔" + "═" * 58 + "╗")
    log("║  ANALYSE PROFONDE: CORRÉLATION SPY vs MARCHÉ CRYPTO     ║")
    log("║  1 an de données — ML + Statistiques avancées           ║")
    log("╚" + "═" * 58 + "╝")
    log("")

    # Phase 1: Collect data
    symbols = get_top_symbols()
    daily_data, hourly_data = collect_all_data(symbols)

    if not daily_data:
        log("No data collected! Aborting.", "ERROR")
        return

    # Phase 2: Feature engineering
    market_features = compute_market_features(daily_data)
    if market_features.empty:
        log("Feature engineering failed!", "ERROR")
        return

    # Phase 3: Surge simulation
    surges_df = simulate_surges_hourly(hourly_data)
    daily_surges = aggregate_daily_surges(surges_df)

    # Merge market features with surge data
    # (inner join: keep only days present in both datasets)
    if not daily_surges.empty:
        merged = market_features.join(daily_surges, how='inner')
        log(f"Merged dataset: {len(merged)} days with both market + surge data")
    else:
        # Degraded mode: downstream phases guard on missing surge columns.
        log("No surge data! Using market features only.", "WARN")
        merged = market_features

    # Phase 4: Statistical analysis
    stat_results = statistical_analysis(merged)

    # Phase 5: ML analysis
    ml_results = ml_analysis(merged)

    # Phase 6: Advanced analysis
    adv_results = advanced_analysis(merged, surges_df)

    # ═══════════════════════════════════════════════════════════════════
    # COMPILE FINAL RESULTS
    # ═══════════════════════════════════════════════════════════════════
    elapsed = time.time() - start_time

    final_results = {
        'metadata': {
            'analysis_date': datetime.now().isoformat(),
            'computation_time_seconds': round(elapsed, 1),
            'symbols_analyzed': len(daily_data),
            'hourly_symbols': len(hourly_data),
            'total_surges_simulated': len(surges_df) if not surges_df.empty else 0,
            'data_days': len(market_features),
            'merged_days': len(merged),
            'lookback_days': LOOKBACK_DAYS,
            'date_range': {
                'start': str(market_features.index.min().date()) if len(market_features) > 0 else None,
                'end': str(market_features.index.max().date()) if len(market_features) > 0 else None,
            }
        },
        'statistical_analysis': stat_results,
        'ml_analysis': ml_results,
        'advanced_analysis': adv_results,
        'data_summary': {
            'surge_stats': {
                'total_surges': len(surges_df) if not surges_df.empty else 0,
                'avg_per_day': round(len(surges_df) / max(len(daily_surges), 1), 1) if not surges_df.empty else 0,
                'overall_win_rate': round(surges_df['is_win'].mean() * 100, 1) if not surges_df.empty else 0,
                'overall_mean_pnl': round(surges_df['exit_pnl_pct'].mean(), 4) if not surges_df.empty else 0,
                'overall_median_pnl': round(surges_df['exit_pnl_pct'].median(), 4) if not surges_df.empty else 0,
            },
            'market_stats': {
                'btc_total_return': round(market_features['btc_return_1d'].sum(), 1) if 'btc_return_1d' in market_features else 0,
                'avg_daily_vol_7d': round(market_features['btc_volatility_7d'].mean(), 3) if 'btc_volatility_7d' in market_features else 0,
            }
        }
    }

    # ═══════════════════════════════════════════════════════════════════
    # GENERATE CONCLUSIONS
    # ═══════════════════════════════════════════════════════════════════
    conclusions = generate_conclusions(final_results, stat_results, ml_results, adv_results)
    final_results['conclusions'] = conclusions

    # Save results; default=str stringifies anything json can't serialize
    # natively (dates, numpy scalars).
    with open(RESULTS_FILE, 'w') as f:
        json.dump(final_results, f, indent=2, default=str)
    log(f"\nResults saved to {RESULTS_FILE}")

    # Print summary
    print_summary(final_results)

    log(f"\n✅ Analysis complete in {elapsed:.0f} seconds ({elapsed/60:.1f} minutes)")


def generate_conclusions(results, stat_results, ml_results, adv_results):
    """Generate actionable conclusions from all analyses.

    All lookups are defensive (``.get`` with defaults) so that partially
    populated analysis dicts — e.g. a skipped model or a regime entry
    missing a metric — degrade gracefully instead of raising KeyError.

    Parameters
    ----------
    results : dict
        Full compiled result dict (kept for interface stability).
    stat_results, ml_results, adv_results : dict
        Outputs of the statistical, ML and advanced analysis phases.

    Returns
    -------
    dict
        Keys: hypothesis_validated, confidence_level, key_findings,
        exploitable_signals, recommendations, risks.
    """
    conclusions = {
        'hypothesis_validated': False,
        'confidence_level': 'LOW',
        'key_findings': [],
        'exploitable_signals': [],
        'recommendations': [],
        'risks': [],
    }

    # --- Hypothesis check: enough correlations clearing the 5% (and
    # ideally 1%) significance thresholds validates the spy/market link.
    corrs = stat_results.get('correlations', {})
    significant_corrs = {k: v for k, v in corrs.items() if v.get('significant_5pct')}
    very_significant = {k: v for k, v in corrs.items() if v.get('significant_1pct')}

    if len(significant_corrs) > 5:
        conclusions['hypothesis_validated'] = True
        conclusions['confidence_level'] = 'HIGH' if len(very_significant) > 3 else 'MEDIUM'

    # --- Key findings from the first 10 correlations with |r| >= 0.15.
    for k, v in list(corrs.items())[:10]:
        r = v.get('pearson_r', 0)  # tolerate malformed entries
        if abs(r) >= 0.15:
            direction = "positive" if r > 0 else "négative"
            conclusions['key_findings'].append(
                f"Corrélation {direction} {k}: r={r:+.3f} (p={v.get('pearson_p', 1):.4f})"
            )

    # --- Granger causality findings.
    granger = stat_results.get('granger_causality', {})
    for k, v in granger.items():
        if v.get('significant'):
            conclusions['key_findings'].append(
                f"Causalité Granger: {k} (lag={v.get('best_lag')}j, p={v.get('min_p_value', 1):.4f})"
            )

    # --- ML findings: any model with a minimally useful R².
    for target, models in ml_results.items():
        if not isinstance(models, dict):
            continue
        for model_name, metrics in models.items():
            if isinstance(metrics, dict) and metrics.get('r2_score', 0) > 0.1:
                conclusions['key_findings'].append(
                    f"ML: {model_name} prédit {target} avec R²={metrics['r2_score']:.3f}"
                )

    # --- Exploitable signals: best/worst market regime by PnL per surge.
    regime = stat_results.get('regime_analysis', {})
    if regime:
        best_regime = max(regime, key=lambda x: regime[x].get('avg_surge_pnl', -999))
        worst_regime = min(regime, key=lambda x: regime[x].get('avg_surge_pnl', 999))
        conclusions['exploitable_signals'].append(
            f"Régime optimal: {best_regime} (PnL/surge={regime[best_regime].get('avg_surge_pnl', 0):+.3f}%, "
            f"WR={regime[best_regime].get('avg_win_rate', 0):.0f}%)"
        )
        conclusions['exploitable_signals'].append(
            f"Régime risqué: {worst_regime} (PnL/surge={regime[worst_regime].get('avg_surge_pnl', 0):+.3f}%, "
            f"WR={regime[worst_regime].get('avg_win_rate', 0):.0f}%)"
        )

    # --- Best BTC trend condition.
    btc_conds = adv_results.get('btc_trend_conditions', {})
    if btc_conds:
        best_cond = max(btc_conds, key=lambda x: btc_conds[x].get('avg_surge_pnl', -999))
        conclusions['exploitable_signals'].append(
            f"Meilleure condition BTC: {best_cond} → PnL={btc_conds[best_cond].get('avg_surge_pnl', 0):+.3f}%"
        )

    # --- Top-3 hours of the day by average surge PnL.
    hourly = adv_results.get('hourly_analysis', {})
    if hourly:
        best_hours = sorted(hourly, key=lambda x: hourly[x].get('avg_pnl', -999), reverse=True)[:3]
        pnl_strs = [f"{hourly[h].get('avg_pnl', 0):+.3f}%" for h in best_hours]
        conclusions['exploitable_signals'].append(
            f"Meilleures heures: {', '.join(best_hours)} (PnL moyen: {', '.join(pnl_strs)})"
        )

    # --- Recommendations depend on whether the hypothesis held.
    if conclusions['hypothesis_validated']:
        conclusions['recommendations'].append(
            "✅ La relation spy/marché EST statistiquement validée. "
            "Les conditions de marché PRÉDISENT significativement la performance du spy."
        )
        conclusions['recommendations'].append(
            "Intégrer le score T/R et les conditions de marché pour moduler "
            "l'agressivité du spy (taille des positions, seuils de surges)."
        )
    else:
        conclusions['recommendations'].append(
            "⚠️ La relation n'est PAS suffisamment forte pour être exploitée seule. "
            "Le spy fonctionne principalement sur la micro-structure du marché (surges), "
            "indépendamment de la tendance macro."
        )

    # --- Standing caveat: the study is a simulation, not live spy data.
    conclusions['risks'].append(
        "Analyse basée sur simulation horaire (pas sur les données réelles du spy 7s). "
        "Les résultats réels peuvent différer en raison des latences, du slippage, "
        "et de la granularité plus fine du spy."
    )

    return conclusions


def print_summary(results):
    """Pretty-print the headline numbers, conclusions and ML scores to stdout."""
    bar = "═" * 70
    print("\n" + bar)
    print("       RÉSUMÉ — ANALYSE PROFONDE SPY vs MARCHÉ CRYPTO")
    print(bar)

    # Dataset overview.
    md = results.get('metadata', {})
    print(f"\n📊 Données: {md.get('symbols_analyzed', 0)} symboles, "
          f"{md.get('data_days', 0)} jours, "
          f"{md.get('total_surges_simulated', 0):,} surges simulés")
    date_range = md.get('date_range', {})
    print(f"📅 Période: {date_range.get('start', '?')} → {date_range.get('end', '?')}")
    print(f"⏱️  Durée: {md.get('computation_time_seconds', 0):.0f}s")

    # Surge simulation headline numbers.
    sstats = results.get('data_summary', {}).get('surge_stats', {})
    print(f"\n🎯 Surges: {sstats.get('total_surges', 0):,} total, "
          f"{sstats.get('avg_per_day', 0):.0f}/jour, "
          f"WR={sstats.get('overall_win_rate', 0):.1f}%, "
          f"PnL moyen={sstats.get('overall_mean_pnl', 0):+.4f}%")

    # Hypothesis verdict banner.
    concl = results.get('conclusions', {})
    if concl.get('hypothesis_validated', False):
        status = "✅ VALIDÉE"
    else:
        status = "❌ NON VALIDÉE"
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"  HYPOTHÈSE: Corrélation Spy/Marché = {status}")
    print(f"  Niveau de confiance: {concl.get('confidence_level', 'LOW')}")
    print(f"{rule}")

    # At most ten key findings.
    found = concl.get('key_findings', [])
    if found:
        print(f"\n📋 Découvertes clés ({len(found)}):")
        for item in found[:10]:
            print(f"   • {item}")

    sigs = concl.get('exploitable_signals', [])
    if sigs:
        print(f"\n💡 Signaux exploitables ({len(sigs)}):")
        for item in sigs:
            print(f"   • {item}")

    advice = concl.get('recommendations', [])
    if advice:
        print("\n🎯 Recommandations:")
        for item in advice:
            print(f"   • {item}")

    caveats = concl.get('risks', [])
    if caveats:
        print("\n⚠️  Risques:")
        for item in caveats:
            print(f"   • {item}")

    # ML performance summary: one line per (model, target) with an R².
    ml_res = results.get('ml_analysis', {})
    if ml_res:
        print("\n🤖 Performance ML:")
        for tgt, models in ml_res.items():
            if not isinstance(models, dict):
                continue
            for mname, metrics in models.items():
                if isinstance(metrics, dict) and 'r2_score' in metrics:
                    print(f"   {mname} → {tgt}: "
                          f"R²={metrics['r2_score']:.3f}, "
                          f"MAE={metrics.get('mae', '?')}, "
                          f"Direction={metrics.get('prediction_direction_accuracy', '?')}%")

    print("\n" + bar)
    print(f"  Résultats complets sauvegardés dans: {RESULTS_FILE}")
    print(bar)


# Script entry point: run the full analysis pipeline when executed directly.
if __name__ == '__main__':
    main()
