Dimension Reduction

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

pd.options.display.float_format = "{:,.4f}".format
plt.style.use('seaborn-v0_8')
from cmds.portfolio import *
# Load S&P 500 individual stock returns
spx_file = '../data/spx_returns_weekly.xlsx'
stock_rets = pd.read_excel(spx_file, 's&p500 rets').set_index('date')
bench_rets = pd.read_excel(spx_file, 'benchmark rets').set_index('date')

# Load sector ETF returns
sector_file = '../data/sector_etf_data.xlsx'
sector_rets = pd.read_excel(sector_file, 'total returns').set_index('date')

print(f"Stock returns shape: {stock_rets.shape}")
print(f"Sector returns shape: {sector_rets.shape}")
print(f"Benchmark returns shape: {bench_rets.shape}")
Stock returns shape: (542, 442)
Sector returns shape: (380, 11)
Benchmark returns shape: (542, 7)
# Align all datasets to common dates
common_dates = stock_rets.index.intersection(sector_rets.index).intersection(bench_rets.index)

# Extract aligned data
stocks = stock_rets.loc[common_dates]
sectors = sector_rets.loc[common_dates]
benchmarks = bench_rets.loc[common_dates]

# Combine sector and benchmark factors (excluding overlapping ETFs)
benchmarks_clean = benchmarks.drop(columns=['SPY','TLT','IYR','BTC'], errors='ignore')
factors = pd.concat([sectors, benchmarks_clean], axis=1)

# Display final dataset information
final_dataset_info = pd.DataFrame({
    'Metric': ['Common Dates', 'Number of Stocks', 'Number of Factors', 'Date Range'],
    'Value': [
        f"{len(common_dates)} observations",
        f"{stocks.shape[1]} stocks", 
        f"{factors.shape[1]} factors",
        f"{common_dates[0].strftime('%Y-%m-%d')} to {common_dates[-1].strftime('%Y-%m-%d')}"
    ]
})
final_dataset_info
Metric Value
0 Common Dates 361 observations
1 Number of Stocks 442 stocks
2 Number of Factors 14 factors
3 Date Range 2018-06-29 to 2025-05-23
TICKS = [
    'AAPL',
    'BRK/B',
    'NVDA',
    'TSLA',
    'V',
    'C',
    'LLY',
    'AMZN',
]
# Get OLS metrics for factor decomposition
ols_metrics = get_ols_metrics(factors, stocks, annualization=52)
ols_metrics.loc[TICKS].style.format('{:.1%}',na_rep='-')
  alpha XLK XLI XLF XLC XLRE XLE XLY XLB XLV XLU XLP USO IEF GLD r-squared Info Ratio
AAPL 7.5% 102.2% -25.2% -5.0% 2.0% 8.3% -1.5% 1.3% -10.8% -1.1% 6.5% 36.1% 0.7% -15.2% -9.1% 68.3% 46.3%
BRK/B 7.2% 2.2% 5.4% 66.4% 2.9% -9.4% 3.9% -18.2% -0.2% 15.1% 0.3% 18.7% 0.1% 12.5% -3.3% 74.8% 69.5%
NVDA 22.9% 179.2% 6.7% -16.7% -24.0% -44.5% -6.2% 48.4% 13.5% 1.9% 9.5% -57.3% 2.4% 18.5% 8.2% 68.8% 83.6%
TSLA 43.7% 17.8% -84.6% -70.9% -60.4% -9.9% 16.6% 300.0% 16.3% -24.2% 29.7% 2.9% 13.0% -67.8% 2.4% 53.4% 97.1%
V 4.9% 29.5% -7.5% 42.7% 16.4% 12.6% -1.3% -7.2% -6.4% 8.4% -9.6% 27.2% -2.1% 13.4% -4.8% 62.2% 33.1%
C -4.6% 0.6% 1.1% 146.3% 9.7% -6.0% 8.1% 0.8% -5.0% -22.6% 3.6% -21.9% -1.7% 12.8% 3.9% 81.0% -28.7%
LLY 29.6% 28.3% -12.9% -16.1% -12.6% 4.3% 16.0% -19.9% -20.5% 140.8% 0.3% -16.3% -0.6% 12.9% -3.1% 39.3% 122.0%
AMZN 0.0% 33.2% -44.3% 0.1% 34.3% -27.6% -0.1% 108.0% -17.8% 10.4% -7.9% -5.9% -1.3% 13.1% 15.7% 67.8% 0.1%
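get_ols_metrics is a helper from the course's cmds.portfolio module; it fits the linear factor decomposition r_i,t = alpha_i + beta_i' f_t + eps_i,t for each stock. As an illustrative sketch only (the helper's exact definitions are not shown here), one row of the table can be approximated with a plain statsmodels regression, annualizing the weekly alpha by 52 and taking the information ratio as annualized alpha over annualized residual volatility; both of those are assumptions about the helper, not facts from this notebook.

# Illustrative sketch of one row of the table above (AAPL), assuming alpha is
# annualized by 52 and the info ratio is annualized alpha / annualized residual vol.
res_aapl = sm.OLS(stocks['AAPL'], sm.add_constant(factors)).fit()
row_aapl = res_aapl.params.rename({'const': 'alpha'})
row_aapl['alpha'] *= 52
row_aapl['r-squared'] = res_aapl.rsquared
row_aapl['Info Ratio'] = row_aapl['alpha'] / (res_aapl.resid.std() * np.sqrt(52))
row_aapl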
# Perform factor decomposition for each stock
USE_INTERCEPT = True

# Initialize results DataFrames
betas = pd.DataFrame(index=stocks.columns, columns=factors.columns, dtype=float)
residuals = pd.DataFrame(index=stocks.index, columns=stocks.columns, dtype=float)
r_squared = pd.Series(index=stocks.columns, dtype=float)

# Prepare factor matrix (with intercept if needed)
X = factors.copy()
if USE_INTERCEPT:
    # add_constant names the intercept column 'const'; rename it to 'alpha' so that
    # results.params aligns with the betas columns when assigned in the loop below
    X = sm.add_constant(X).rename(columns={'const': 'alpha'})
    betas.insert(0, 'alpha', 0.0)

# Run regressions for each stock
for stock in stocks.columns:
    y = stocks[stock]
    results = sm.OLS(y, X).fit()
    betas.loc[stock] = results.params
    residuals[stock] = results.resid
    r_squared[stock] = results.rsquared

# Display regression summary statistics
regression_summary = pd.DataFrame({
    'value': [
        f"{r_squared.mean():.3f}",
        f"{r_squared.median():.3f}",
        f"{r_squared.min():.3f}",
        f"{r_squared.max():.3f}",
        f"{r_squared.std():.3f}"
    ]
}, index=['Average R2', 'Median R2', 'Min R2', 'Max R2', 'Std R2'])

# Find stocks with min and max R2
min_r2_stock = r_squared.idxmin()
max_r2_stock = r_squared.idxmax()

r2_extremes = pd.DataFrame({
    'Stock': [min_r2_stock, max_r2_stock],
    'R2': [f"{r_squared[min_r2_stock]:.3f}", f"{r_squared[max_r2_stock]:.3f}"]
}, index=['Min R2', 'Max R2'])

display(regression_summary)
display(r2_extremes)
value
Average R2 0.571
Median R2 0.570
Min R2 0.123
Max R2 0.899
Std R2 0.152
Stock R2
Min R2 ERIE 0.123
Max R2 XOM 0.899
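One use of this decomposition (added here only as an illustrative sketch; it is not computed elsewhere in the notebook) is a low-dimensional covariance estimate: with the loadings B, the 14x14 factor covariance Sigma_f, and a diagonal matrix D of residual variances, the factor model implies Sigma is approximately B Sigma_f B' + D, so the 442x442 covariance is driven by a handful of factor terms plus one idiosyncratic variance per stock.

# Illustrative sketch: factor-implied covariance, Sigma ≈ B Sigma_f B' + D
B = betas.drop(columns='alpha', errors='ignore').astype(float)   # 442 x 14 loadings
Sigma_f = factors.cov()                                          # 14 x 14 factor covariance
D = np.diag(residuals.var())                                     # diagonal idiosyncratic variances
Sigma_factor = B.values @ Sigma_f.values @ B.values.T + D        # implied 442 x 442 covariance

# Spot-check against the sample covariance for the first stock
print(f"Implied vs sample variance ({stocks.columns[0]}): "
      f"{Sigma_factor[0, 0]:.6f} vs {stocks.cov().iloc[0, 0]:.6f}")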

Dimension Reduction#

nPCA = 3

# Compare PCA structure between original returns and residuals
pca_stocks = PCA()
pca_resids = PCA()

pca_stocks.fit(stocks)
pca_resids.fit(residuals)

# Calculate cumulative explained variance for the first nPCA components
stocks_cum_var = np.cumsum(pca_stocks.explained_variance_ratio_[:nPCA] * 100)
resids_cum_var = np.cumsum(pca_resids.explained_variance_ratio_[:nPCA] * 100)

# Create comparison DataFrame
pca_analysis = pd.DataFrame({
    'Stock Returns Explained (%)': [f"{var:.2f}%" for var in stocks_cum_var],
    'Residuals Explained (%)': [f"{var:.2f}%" for var in resids_cum_var]
}, index=[f'PC{i+1}' for i in range(nPCA)])

pca_analysis
Stock Returns Explained (%) Residuals Explained (%)
PC1 40.75% 4.81%
PC2 46.35% 8.26%
PC3 50.41% 11.37%
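Another way to see the contrast (an added illustration, not part of the original analysis) is to count how many components each dataset needs to reach a given share of total variance; the residuals should require far more, since the common structure has been stripped out.

# Added illustration: components needed to reach 90% of total variance
target = 0.90
k_stocks = int(np.argmax(np.cumsum(pca_stocks.explained_variance_ratio_) >= target)) + 1
k_resids = int(np.argmax(np.cumsum(pca_resids.explained_variance_ratio_) >= target)) + 1
print(f"Components for {target:.0%} of variance: returns {k_stocks}, residuals {k_resids}")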
# Numerical measures of correlation structure
condition_numbers = pd.DataFrame({
    'Dataset': ['Original Returns', 'Residuals', 'Factors'],
    'Condition Number': [
        f"{np.linalg.cond(stocks.corr()):.1e}",
        f"{np.linalg.cond(residuals.corr()):.1e}", 
        f"{np.linalg.cond(factors.corr()):.1e}"
    ],
    'Determinant': [
        f"{np.linalg.det(stocks.corr()):.1e}",
        f"{np.linalg.det(residuals.corr()):.1e}",
        f"{np.linalg.det(factors.corr()):.1e}"
    ],
    'Average Correlation': [
        f"{stocks.corr().abs().mean().mean():.1%}",
        f"{residuals.corr().abs().mean().mean():.1%}",
        f"{factors.corr().abs().mean().mean():.1%}"
    ]
})

condition_numbers
Dataset Condition Number Determinant Average Correlation
0 Original Returns 1.2e+19 -0.0e+00 39.2%
1 Residuals 4.6e+18 0.0e+00 6.9%
2 Factors 1.0e+02 2.0e-06 50.9%
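The enormous condition numbers for the stock and residual correlation matrices reflect that 442 series are estimated from only 361 weekly observations, so the sample matrices are singular (hence determinants of essentially zero). As an aside that goes beyond this notebook, a common remedy is a shrinkage estimator such as scikit-learn's Ledoit-Wolf, which is guaranteed to be well conditioned and invertible.

# Aside (not used elsewhere in this notebook): Ledoit-Wolf shrinkage yields a
# well-conditioned covariance estimate even though N (442) exceeds T (361).
from sklearn.covariance import LedoitWolf

lw = LedoitWolf().fit(stocks.values)
print(f"Shrinkage intensity: {lw.shrinkage_:.2f}")
print(f"Condition number of shrunk covariance: {np.linalg.cond(lw.covariance_):.1e}")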
# Factor correlation heatmap
plt.figure(figsize=(12, 10))
corr_matrix = factors.corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Factor Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
../_images/39df26bba8efd79d27a2055b8135c49a7ac20300636d92322bbae6b84a5c2ac2.png
# Residual correlation heatmap (sample of stocks for readability)
# Get 20 stocks spaced evenly through the list
sample_stocks = residuals.columns[::20][:20]  # Take every 20th stock, up to 20 stocks

plt.figure(figsize=(12, 10))
corr_matrix = residuals.loc[:, sample_stocks].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=False, fmt='.2f', cmap='RdBu_r', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Residual Correlation Matrix (20 Sample Stocks)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
../_images/8740c57afad6f7d143baf8c44af8ec8e1853e1f3090e10c67e6ec814003992be.png

Reduced Risk#

# PRIMARY RESULTS: Key Risk Metrics Comparison

primary_results = pd.DataFrame({
    'Metric': ['Volatility', 'Skewness', 'Kurtosis', 'Average Correlation'],
    'Original Returns': [
        stocks.std().mean(),
        stocks.skew().mean(), 
        stocks.kurtosis().mean(),
        stocks.corr().abs().mean().mean()
    ],
    'Residuals': [
        residuals.std().mean(),
        residuals.skew().mean(),
        residuals.kurtosis().mean(), 
        residuals.corr().abs().mean().mean()
    ],
})

primary_results.set_index('Metric', inplace=True)

primary_results.style.format('{:.1%}', na_rep='-')
  Original Returns Residuals
Metric    
Volatility 4.6% 3.0%
Skewness 0.1% 6.0%
Kurtosis 546.3% 423.0%
Average Correlation 39.2% 6.9%
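To connect the drop in average correlation to portfolio risk (a quick added illustration, not in the original notebook): an equal-weighted combination of the residuals has far lower volatility than an equal-weighted portfolio of the raw returns, because idiosyncratic risk diversifies away while the shared factor risk does not.

# Added illustration: idiosyncratic risk diversifies away under equal weighting
n = stocks.shape[1]
w = np.full(n, 1.0 / n)
ew_return_vol = (stocks @ w).std() * np.sqrt(52)       # retains common factor risk
ew_residual_vol = (residuals @ w).std() * np.sqrt(52)  # largely diversified away
print(f"Equal-weighted annualized vol: returns {ew_return_vol:.1%}, residuals {ew_residual_vol:.1%}")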
# Load benchmark returns
benchmark = pd.read_excel(spx_file, sheet_name='benchmark rets')
benchmark.set_index('date', inplace=True)

# Calculate metrics for both stocks and residuals
metrics = {}
for metric, func in [('Volatility', lambda x: x.std()), 
                    ('Skewness', lambda x: x.skew()),
                    ('SPY_Correlation', lambda x: pd.Series({col: x[col].corr(benchmark['SPY']) 
                                                           for col in x.columns}))]:
    # Calculate metric for both stocks and residuals
    stocks_metric = func(stocks)
    residuals_metric = func(residuals)
    
    # Create dataframe with describe statistics for this metric
    metric_df = pd.DataFrame({
        'stocks': stocks_metric.describe(),
        'residuals': residuals_metric.describe()
    })
    metrics[metric] = metric_df

# Combine into multilevel dataframe with metrics as top level
stats_df = pd.concat(metrics, axis=1)
stats_df.style.format('{:.1%}',na_rep='-')
  Volatility Skewness SPY_Correlation
  stocks residuals stocks residuals stocks residuals
count 44200.0% 44200.0% 44200.0% 44200.0% 44200.0% 44200.0%
mean 4.6% 3.0% 0.1% 6.0% 59.2% 0.1%
std 1.2% 1.0% 47.7% 70.9% 12.2% 0.5%
min 2.5% 1.2% -241.9% -617.9% 14.9% -1.7%
25% 3.7% 2.3% -26.1% -22.7% 52.4% -0.3%
50% 4.3% 2.8% -1.5% 11.0% 61.1% 0.1%
75% 5.1% 3.4% 23.7% 38.7% 67.6% 0.4%
max 11.1% 9.6% 196.7% 252.4% 82.5% 2.1%
# Summary table of normality improvements across all stocks
all_normality_stats = []

for stock in stocks.columns:
    # Calculate key statistics
    orig_skew = stocks[stock].skew()
    orig_kurt = stocks[stock].kurtosis()
    resid_skew = residuals[stock].skew()
    resid_kurt = residuals[stock].kurtosis()
    
    # Calculate improvement metrics
    skew_improvement = abs(resid_skew) - abs(orig_skew)  # Negative = improvement
    kurt_improvement = abs(resid_kurt) - abs(orig_kurt)  # Negative = improvement
    
    all_normality_stats.append({
        'Stock': stock,
        'Orig_Skew': orig_skew,
        'Resid_Skew': resid_skew,
        'Skew_Change': skew_improvement,
        'Orig_Kurt': orig_kurt,
        'Resid_Kurt': resid_kurt,
        'Kurt_Change': kurt_improvement
    })

normality_summary = pd.DataFrame(all_normality_stats)

# Calculate summary statistics
normality_summary_stats = pd.DataFrame({
    'Metric': [
        'Stocks with Improved Skewness',
        'Stocks with Improved Kurtosis', 
        'Average Skewness Improvement',
        'Average Kurtosis Improvement'
    ],
    'Value': [
        f"{(normality_summary['Skew_Change'] < 0).sum()} / {len(normality_summary)}",
        f"{(normality_summary['Kurt_Change'] < 0).sum()} / {len(normality_summary)}",
        f"{normality_summary['Skew_Change'].mean():.4f}",
        f"{normality_summary['Kurt_Change'].mean():.4f}"
    ]
})
display(normality_summary_stats)

# Show top 10 improvements
top_skew = normality_summary.nsmallest(10, 'Skew_Change')[['Stock', 'Orig_Skew', 'Resid_Skew', 'Skew_Change']]
top_skew
Stock Orig_Skew Resid_Skew Skew_Change
425 WELL 1.8789 0.1516 -1.7272
417 VTR 1.5470 0.2835 -1.2634
230 KIM 1.9671 0.7868 -1.1803
314 PAYX -1.0960 0.1423 -0.9537
391 TRGP -1.1758 -0.2274 -0.9484
327 PM -1.0759 -0.1405 -0.9354
122 DOC 1.0237 0.1174 -0.9063
126 DTE 0.7894 -0.0039 -0.7856
219 J -0.8223 -0.0422 -0.7801
234 KMI -0.7983 -0.0694 -0.7288
# Volatility comparison: Original vs Residuals
vols = pd.DataFrame({
    'Original': stocks.std() * np.sqrt(52),
    'Residuals': residuals.std() * np.sqrt(52)
})

fig, ax = plt.subplots(figsize=(10, 8))
x = vols['Original']
y = vols['Residuals']
ax.scatter(x, y, alpha=0.6, s=50)

# Add 45-degree line
min_val = min(min(x), min(y))
max_val = max(max(x), max(y))
ax.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8, label='45° line')

ax.set_xlabel('Original Volatility (Annualized)', fontsize=12)
ax.set_ylabel('Residual Volatility (Annualized)', fontsize=12)
ax.set_title('Volatility Reduction Through Factor Decomposition', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Average volatility reduction: {(1 - vols['Residuals'].mean() / vols['Original'].mean()) * 100:.1f}%")
../_images/c6dec96434a406d324d9dcd9334123244c6d649d5462055242f11ccb7129a857.png
Average volatility reduction: 35.4%
# Skewness comparison: Original vs Residuals
skews = pd.DataFrame({
    'Original': stocks.skew(),
    'Residuals': residuals.skew()
})

fig, ax = plt.subplots(figsize=(10, 8))
x = skews['Original']
y = skews['Residuals']
ax.scatter(x, y, alpha=0.6, s=50)

# Add 45-degree line
min_val = min(min(x), min(y))
max_val = max(max(x), max(y))
ax.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8, label='45° line')

ax.set_xlabel('Original Skewness', fontsize=12)
ax.set_ylabel('Residual Skewness', fontsize=12)
ax.set_title('Skewness: Original vs Residuals', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
../_images/a4bfbf82233906a1897219912feafe254b85b0f2b8079f3bc6582aa95df81483.png
# Kurtosis comparison: Original vs Residuals
kurts = pd.DataFrame({
    'Original': stocks.kurtosis(),
    'Residuals': residuals.kurtosis()
})

fig, ax = plt.subplots(figsize=(10, 8))
x = kurts['Original']
y = kurts['Residuals']
ax.scatter(x, y, alpha=0.6, s=50)

# Add 45-degree line
min_val = min(min(x), min(y))
max_val = max(max(x), max(y))
ax.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8, label='45° line')

ax.set_xlabel('Original Kurtosis', fontsize=12)
ax.set_ylabel('Residual Kurtosis', fontsize=12)
ax.set_title('Kurtosis: Original vs Residuals', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
../_images/283f521a1a04354b1c78001df5840bb2dc67bfd6880b6157f4ea60295d6dab10.png

Normality#

# Statistical tests for normality improvement
from scipy.stats import shapiro, jarque_bera, normaltest

# Test normality for a sample of stocks
sample_stocks = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
normality_tests = []

for stock in sample_stocks:
    if stock in stocks.columns:
        # Original returns
        orig_stats = {
            'Shapiro-Wilk p-value': shapiro(stocks[stock])[1],
            'Jarque-Bera p-value': jarque_bera(stocks[stock])[1],
            'D\'Agostino p-value': normaltest(stocks[stock])[1]
        }
        
        # Residuals
        resid_stats = {
            'Shapiro-Wilk p-value': shapiro(residuals[stock])[1],
            'Jarque-Bera p-value': jarque_bera(residuals[stock])[1],
            'D\'Agostino p-value': normaltest(residuals[stock])[1]
        }
        
        normality_tests.append({
            'Stock': stock,
            'Original_Shapiro': orig_stats['Shapiro-Wilk p-value'],
            'Residual_Shapiro': resid_stats['Shapiro-Wilk p-value'],
            'Original_JB': orig_stats['Jarque-Bera p-value'],
            'Residual_JB': resid_stats['Jarque-Bera p-value']
        })

normality_df = pd.DataFrame(normality_tests)
normality_df
Stock Original_Shapiro Residual_Shapiro Original_JB Residual_JB
0 AAPL 0.0001 0.0113 0.0000 0.0001
1 MSFT 0.0021 0.0000 0.0000 0.0000
2 GOOGL 0.0200 0.0000 0.0114 0.0000
3 AMZN 0.0015 0.0001 0.0002 0.0000
4 TSLA 0.0000 0.0000 0.0000 0.0000
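The five tickers above reject normality at conventional levels both before and after the decomposition. To see whether this holds broadly (an added summary, not part of the original notebook), count how often the Jarque-Bera test fails to reject at the 5% level across all 442 stocks.

# Added summary: share of stocks where Jarque-Bera does NOT reject normality at 5%
from scipy.stats import jarque_bera

jb_p_stocks = stocks.apply(lambda s: jarque_bera(s)[1])
jb_p_resids = residuals.apply(lambda s: jarque_bera(s)[1])
print(f"Returns not rejected:   {(jb_p_stocks > 0.05).mean():.1%}")
print(f"Residuals not rejected: {(jb_p_resids > 0.05).mean():.1%}")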
# Q-Q plots for normality assessment
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Original returns Q-Q plot
stats.probplot(stocks['AAPL'], dist="norm", plot=ax1)
ax1.set_title("Original Returns Q-Q Plot (AAPL)", fontweight='bold')
ax1.grid(True, alpha=0.3)

# Residuals Q-Q plot
stats.probplot(residuals['AAPL'], dist="norm", plot=ax2)
ax2.set_title("Residuals Q-Q Plot (AAPL)", fontweight='bold')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
../_images/21ec062b1fc367c3278ba7c105add13e1dd7a559bb74f7ff7a738cc4c4d9763e.png
# Distribution comparison: Original vs Residuals
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Original returns distribution
sns.histplot(stocks['AAPL'], kde=True, stat="density", bins=30, ax=ax1)
mean = stocks['AAPL'].mean()
std = stocks['AAPL'].std()
x = np.linspace(mean - 4 * std, mean + 4 * std, 100)
ax1.plot(x, stats.norm.pdf(x, mean, std), label="Normal Curve", color="red")
ax1.set_title("Original Returns Distribution (AAPL)", fontweight='bold')
ax1.legend()

# Residuals distribution
sns.histplot(residuals['AAPL'], kde=True, stat="density", bins=30, ax=ax2)
mean = residuals['AAPL'].mean()
std = residuals['AAPL'].std()
x = np.linspace(mean - 4 * std, mean + 4 * std, 100)
ax2.plot(x, stats.norm.pdf(x, mean, std), label="Normal Curve", color="red")
ax2.set_title("Residuals Distribution (AAPL)", fontweight='bold')
ax2.legend()

plt.tight_layout()
plt.show()
../_images/4207938610065ab664d5c4a147b04724613a56d45b4bc1bfa1b451e15ebac6a0.png
# Quantify QQ plot improvements with summary statistics
qq_improvements = []

for stock in sample_stocks:
    if stock in stocks.columns:
        # Standardize each series so its empirical quantiles are on the same scale
        # as the standard-normal quantiles they are compared against
        probs = np.linspace(0.001, 0.999, 100)
        orig_z = (stocks[stock] - stocks[stock].mean()) / stocks[stock].std()
        resid_z = (residuals[stock] - residuals[stock].mean()) / residuals[stock].std()
        orig_quantiles = np.quantile(orig_z, probs)
        resid_quantiles = np.quantile(resid_z, probs)

        # Theoretical standard-normal quantiles at the same probabilities
        theoretical_quantiles = stats.norm.ppf(probs)

        # Mean absolute deviation from the normal quantiles
        orig_deviation = np.mean(np.abs(orig_quantiles - theoretical_quantiles))
        resid_deviation = np.mean(np.abs(resid_quantiles - theoretical_quantiles))
        
        qq_improvements.append({
            'Stock': stock,
            'Original_Deviation': orig_deviation,
            'Residual_Deviation': resid_deviation,
            'Improvement_%': (1 - resid_deviation / orig_deviation) * 100
        })

qq_df = pd.DataFrame(qq_improvements)
qq_df
Stock Original_Deviation Residual_Deviation Improvement_%
0 AAPL 0.7883 0.8010 -1.6212
1 MSFT 0.7920 0.8061 -1.7901
2 GOOGL 0.7873 0.8012 -1.7632
3 AMZN 0.7852 0.7997 -1.8535
4 TSLA 0.7485 0.7736 -3.3470
# Visual comparison of normality improvements
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for i, stock in enumerate(sample_stocks[:6]):
    if stock in stocks.columns and i < 6:
        ax = axes[i]
        
        # Create side-by-side QQ plots
        stats.probplot(stocks[stock], dist="norm", plot=ax)
        ax.set_title(f'{stock} - Original Returns', fontweight='bold')
        ax.grid(True, alpha=0.3)
        
        # Add statistics as text
        orig_skew = stocks[stock].skew()
        orig_kurt = stocks[stock].kurtosis()
        ax.text(0.05, 0.95, f'Skew: {orig_skew:.2f}\nKurt: {orig_kurt:.2f}', 
                transform=ax.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

# Add residuals QQ plots
fig2, axes2 = plt.subplots(2, 3, figsize=(18, 12))
axes2 = axes2.flatten()

for i, stock in enumerate(sample_stocks[:6]):
    if stock in residuals.columns and i < 6:
        ax = axes2[i]
        
        stats.probplot(residuals[stock], dist="norm", plot=ax)
        ax.set_title(f'{stock} - Residuals', fontweight='bold')
        ax.grid(True, alpha=0.3)
        
        # Add statistics as text
        resid_skew = residuals[stock].skew()
        resid_kurt = residuals[stock].kurtosis()
        ax.text(0.05, 0.95, f'Skew: {resid_skew:.2f}\nKurt: {resid_kurt:.2f}', 
                transform=ax.transAxes, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

plt.tight_layout()
plt.show()
../_images/9bf63272f296e733c49e013b82aebcd87d8217a8a1a07a9bc56b3c78bb034ed6.png ../_images/914ce67a56f9c8a299884470423523be036b5dee7fadfd6aa8066da13d41dc45.png