# Mean-Variance (MV) Analysis of the S&P 500
import numpy as np
import pandas as pd
pd.options.display.float_format = "{:,.4f}".format
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
filepath_data = '../data/spx_returns_weekly.xlsx'
SHEET = 's&p500 rets'
rets_raw = pd.read_excel(filepath_data, sheet_name=SHEET).set_index('date')
FREQ = 52  # weekly returns, so annualize with 52 periods per year
def delta_parameter(target_mean):
    # Weight on the tangency portfolio so the GMV/tangency mix has mean target_mean
    # (uses the full-universe mu, w_gmv, w_tan defined outside this cell)
    delta = (target_mean - mu.T @ w_gmv) / (mu.T @ w_tan - mu.T @ w_gmv)
    return delta
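This helper encodes two-fund separation: every frontier portfolio is a mix $w(\delta) = \delta\, w_{tan} + (1-\delta)\, w_{gmv}$. Because the mean of the mix is linear in $\delta$, the mix that hits a target mean $\bar\mu$ solves (a one-line rearrangement of the code above):

$$\delta = \frac{\bar\mu - \mu^\top w_{gmv}}{\mu^\top w_{tan} - \mu^\top w_{gmv}}$$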
# Multi-Frontier Plot for Various N Values (Nested Subsamples)
N_values = [5, 20, 50, 100, 200, len(rets_raw.columns)]
fig, ax = plt.subplots(figsize=(10, 6))
# Create nested subsamples
np.random.seed(2025) # Use same seed for reproducibility
all_assets = list(rets_raw.columns)
np.random.shuffle(all_assets) # Shuffle once for consistent ordering
# Track which assets are included at each N level
asset_inclusion_level = {}
nested_asset_sets = {}
# Initialize summary data storage
summary_data = []
# Create nested sets
for N in N_values:
    if N > len(rets_raw.columns):
        continue  # skip if N exceeds the available universe
    # Nested by construction: all_assets was shuffled once, so the first N
    # names always contain every smaller subset
    selected_assets = all_assets[:N]
    nested_asset_sets[N] = selected_assets
    idx = sorted(selected_assets)
    rets_n = rets_raw[idx]
    # Record the first (smallest) N level at which each asset appears
    for asset in selected_assets:
        if asset not in asset_inclusion_level:
            asset_inclusion_level[asset] = N
    # Annualized moments for this N
    mu_n = rets_n.mean() * FREQ
    covmat_n = rets_n.cov() * FREQ
    vec_ones_n = np.ones(mu_n.shape)
    # GMV and tangency portfolios: w_gmv proportional to inv(Sigma) @ 1,
    # w_tan proportional to inv(Sigma) @ mu, each rescaled to sum to one
    w_gmv_n = np.linalg.solve(covmat_n, vec_ones_n)
    w_tan_n = np.linalg.solve(covmat_n, mu_n)
    w_gmv_n /= w_gmv_n.sum()
    w_tan_n /= w_tan_n.sum()
    # Best single-asset Sharpe ratio in this subset
    individual_sharpe = mu_n / (rets_n.std() * np.sqrt(FREQ))
    max_individual_sharpe = individual_sharpe.max()
    # Tangency portfolio Sharpe ratio (zero risk-free rate, as throughout)
    tangency_sharpe = (mu_n.T @ w_tan_n) / np.sqrt(w_tan_n.T @ covmat_n @ w_tan_n)
    # Gross market value = sum of absolute weights (leverage of the tangency portfolio)
    gross_market_value = np.abs(w_tan_n).sum()
    summary_data.append({
        'N': N,
        'Max_Individual_Sharpe': max_individual_sharpe,
        'Tangency_Sharpe': tangency_sharpe,
        'Gross_Market_Value': gross_market_value
    })
    # Frontier mix for this N: delta on tangency, (1 - delta) on GMV
    def delta_parameter_n(target_mean):
        return (target_mean - mu_n.T @ w_gmv_n) / (mu_n.T @ w_tan_n - mu_n.T @ w_gmv_n)
    # Fixed annualized target-mean grid: +/-100% keeps every frontier on one readable plot
    MIN_n = -1
    MAX_n = 1
    grid_n = np.linspace(MIN_n, MAX_n, 250)
    df_n = pd.DataFrame(index=grid_n, columns=['delta', 'mu', 'vol'], dtype=float)
    for mu_target in grid_n:
        delta = delta_parameter_n(mu_target)
        w = delta * w_tan_n + (1 - delta) * w_gmv_n
        df_n.loc[mu_target, 'delta'] = delta
        df_n.loc[mu_target, 'mu'] = w.T @ mu_n
        df_n.loc[mu_target, 'vol'] = np.sqrt(w.T @ covmat_n @ w)
    # Plot this frontier
    ax.plot(df_n['vol'], df_n['mu'],
            linewidth=2, alpha=0.8, label=f'N={N}')
# Plot base assets colored by inclusion level
base_mu = rets_raw.mean() * FREQ
base_vol = rets_raw.std() * np.sqrt(FREQ)
# Create color mapping for assets based on their first inclusion level
for N in N_values:
    if N > len(rets_raw.columns):
        continue
    # Assets first included at this N level
    assets_at_level = [asset for asset, level in asset_inclusion_level.items() if level == N]
    if assets_at_level:
        mu_at_level = base_mu[assets_at_level]
        vol_at_level = base_vol[assets_at_level]
        # Color matches the frontier line for this N (default prop cycle: C0, C1, ...)
        color_idx = N_values.index(N)
        ax.scatter(vol_at_level, mu_at_level,
                   color=f'C{color_idx}', alpha=0.7, s=40,
                   zorder=5, edgecolors='black', linewidth=0.5)
# Add vertical line at the global minimum variance portfolio of the largest N
largest_N = N_values[-1]
if largest_N <= len(rets_raw.columns):
    # GMV portfolio for the largest asset set
    largest_assets = nested_asset_sets[largest_N]
    largest_rets = rets_raw[largest_assets]
    largest_mu = largest_rets.mean() * FREQ
    largest_covmat = largest_rets.cov() * FREQ
    largest_vec_ones = np.ones(largest_mu.shape)
    largest_w_gmv = np.linalg.solve(largest_covmat, largest_vec_ones)
    largest_w_gmv /= largest_w_gmv.sum()
    # Volatility of the GMV portfolio
    gmv_vol = np.sqrt(largest_w_gmv.T @ largest_covmat @ largest_w_gmv)
    # Mark it with a vertical line and an x-axis label
    ax.axvline(x=gmv_vol, color='black', linestyle='--', linewidth=2)
    ax.text(gmv_vol, ax.get_ylim()[0] + 0.02 * (ax.get_ylim()[1] - ax.get_ylim()[0]),
            f'GMV Vol: {gmv_vol:.3f}',
            rotation=90, verticalalignment='bottom', horizontalalignment='right',
            fontsize=10, bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))
ax.set_xlabel('Volatility', fontsize=16)
ax.set_ylabel('Expected Return', fontsize=16)
ax.set_title('Mean-Variance Frontiers', fontsize=18)
ax.legend(fontsize=14)
ax.grid(True, alpha=0.3)
# Increase tick label font sizes
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=14)
plt.tight_layout()
plt.show()
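A hedged cross-check before tabulating: with a zero risk-free rate (as assumed throughout), the tangency Sharpe ratio also has the closed form $\sqrt{\mu^\top \Sigma^{-1} \mu}$, which should reproduce (up to sign) the largest-N row of the summary table below. A minimal sketch reusing `rets_raw` and `FREQ` from above:

```python
# Closed-form tangency Sharpe for the full universe (zero risk-free rate assumed)
mu_full = rets_raw.mean() * FREQ
cov_full = rets_raw.cov() * FREQ
sr_closed_form = np.sqrt(mu_full @ np.linalg.solve(cov_full, mu_full))
print(f"Closed-form full-universe tangency Sharpe: {sr_closed_form:.1%}")
```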
# Summary Table: Sharpe Ratios and Gross Market Values by N
summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.set_index('N')
# Format the display
pd.options.display.float_format = "{:,.4f}".format
summary_df.style.format({
'Max_Individual_Sharpe': "{:,.1%}",
'Tangency_Sharpe': "{:,.1%}",
'Gross_Market_Value': "{:,.1%}"
})
| N | Max_Individual_Sharpe | Tangency_Sharpe | Gross_Market_Value |
|---|---|---|---|
| 5 | 90.5% | 109.5% | 272.2% |
| 20 | 90.5% | 150.8% | 449.9% |
| 50 | 91.9% | 202.3% | 809.6% |
| 100 | 108.3% | 277.7% | 1,888.7% |
| 200 | 122.6% | 409.6% | 2,978.0% |
| 442 | 139.3% | 1,038.3% | 8,247.8% |
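The Gross_Market_Value column is $\sum_i |w_i|$, so values far above 100% mean the in-sample tangency portfolio holds large offsetting long and short legs. A small sketch of that decomposition, reusing `w_tan_n` left over from the last loop iteration (i.e. the full N = 442 universe):

```python
# Long/short decomposition of the last tangency portfolio computed above (N = 442)
long_leg = w_tan_n[w_tan_n > 0].sum()    # total long exposure
short_leg = w_tan_n[w_tan_n < 0].sum()   # total short exposure (negative)
print(f"Long: {long_leg:.1%}  Short: {short_leg:.1%}  "
      f"Gross: {long_leg - short_leg:.1%}  Net: {long_leg + short_leg:.1%}")
```

The net exposure comes out to 100% by construction, since the weights were rescaled to sum to one.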
# 2x2 Correlation Heatmaps for Various Subsets
from matplotlib.colors import TwoSlopeNorm
# Define subsets to analyze - smallest 3 N values and largest N
N_values_sorted = sorted([5, 20, 50, 100, len(rets_raw.columns)])
subset_configs = [
{'name': f'N={N_values_sorted[0]}', 'size': N_values_sorted[0], 'title': f'N={N_values_sorted[0]}'},
{'name': f'N={N_values_sorted[1]}', 'size': N_values_sorted[1], 'title': f'N={N_values_sorted[1]}'},
{'name': f'N={N_values_sorted[2]}', 'size': N_values_sorted[2], 'title': f'N={N_values_sorted[2]}'},
{'name': f'N={N_values_sorted[-1]}', 'size': N_values_sorted[-1], 'title': f'N={N_values_sorted[-1]}'}
]
# Use the same random seed for consistency
np.random.seed(2025)
all_assets = list(rets_raw.columns)
np.random.shuffle(all_assets)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()
for i, config in enumerate(subset_configs):
    # Nested subset: the first `size` names in the shuffled ordering
    selected_assets = sorted(all_assets[:config['size']])
    rets_subset = rets_raw[selected_assets]
    corr_matrix = rets_subset.corr()
    # Blank out the upper triangle and diagonal for a cleaner picture
    mask = np.triu(np.ones_like(corr_matrix), k=0)
    corr_matrix_masked = corr_matrix.copy()
    corr_matrix_masked[mask.astype(bool)] = np.nan
    # Diverging color scale centered on a typical pairwise correlation
    center = 0.33
    vmin = 0
    vmax = 0.67
    norm = TwoSlopeNorm(vmin=vmin, vcenter=center, vmax=vmax)
    sns.heatmap(corr_matrix_masked,
                annot=False,  # too many cells to annotate
                norm=norm,
                cmap='RdBu_r',
                cbar=True,
                ax=axes[i],
                square=True)
    axes[i].set_title(config['title'], fontsize=14, fontweight='bold')
    # Thin out tick labels on the larger matrices
    if config['size'] <= 20:
        axes[i].tick_params(axis='both', labelsize=8)
    else:
        axes[i].tick_params(axis='both', labelsize=6)
        step = max(1, config['size'] // 10)
        axes[i].set_xticks(range(0, config['size'], step))
        axes[i].set_yticks(range(0, config['size'], step))
plt.suptitle('Correlation Matrices',
             fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout(rect=[0, 0, 1, 0.96])  # leave headroom for the suptitle
plt.show()
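A note on the masking step: seaborn's `heatmap` accepts a boolean `mask` argument directly, so the NaN assignment above can be folded into the call. An equivalent single-panel sketch with the same norm settings:

```python
# Equivalent heatmap using seaborn's built-in mask argument (N = 20 panel shown)
corr = rets_raw[sorted(all_assets[:20])].corr()
sns.heatmap(corr,
            mask=np.triu(np.ones_like(corr, dtype=bool)),  # hide upper triangle + diagonal
            norm=TwoSlopeNorm(vmin=0, vcenter=0.33, vmax=0.67),
            cmap='RdBu_r', square=True)
plt.show()
```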
# Correlation Statistics DataFrame
correlation_stats = []
# Use the same random seed for consistency
np.random.seed(2025)
all_assets = list(rets_raw.columns)
np.random.shuffle(all_assets)
for config in subset_configs:
    selected_assets = sorted(all_assets[:config['size']])
    rets_subset = rets_raw[selected_assets]
    corr_matrix = rets_subset.corr()
    # Off-diagonal correlations (strict upper triangle, k=1 excludes the diagonal)
    mask = np.triu(np.ones_like(corr_matrix), k=1)
    correlations = corr_matrix.values[mask.astype(bool)]
    avg_pairwise_corr = correlations.mean()
    min_corr = correlations.min()
    max_corr = correlations.max()
    # Conditioning diagnostics for the correlation matrix
    condition_number = np.linalg.cond(corr_matrix)
    determinant = np.linalg.det(corr_matrix)
    correlation_stats.append({
        'N': config['size'],
        'Min Correlation': min_corr,
        'Max Correlation': max_corr,
        'Avg Pairwise Correlation': avg_pairwise_corr,
        'Condition Number': condition_number,
        'Determinant': determinant
    })
# Create DataFrame
correlation_df = pd.DataFrame(correlation_stats)
correlation_df = correlation_df.set_index('N')
# Format the display
display_df = correlation_df.copy()
display_df['Min Correlation'] = display_df['Min Correlation'].apply(lambda x: f"{x:.0%}")
display_df['Max Correlation'] = display_df['Max Correlation'].apply(lambda x: f"{x:.0%}")
display_df['Avg Pairwise Correlation'] = display_df['Avg Pairwise Correlation'].apply(lambda x: f"{x:.0%}")
display_df['Condition Number'] = display_df['Condition Number'].apply(lambda x: f"{x:,.0f}")
display_df['Determinant'] = display_df['Determinant'].apply(lambda x: f"{x:.0e}")
display_df
| N | Min Correlation | Max Correlation | Avg Pairwise Correlation | Condition Number | Determinant |
|---|---|---|---|---|---|
| 5 | 21% | 43% | 30% | 4 | 5e-01 |
| 20 | 9% | 72% | 34% | 33 | 1e-04 |
| 50 | -5% | 77% | 34% | 115 | 1e-13 |
| 442 | -9% | 94% | 36% | 83,061 | 0e+00 |
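Those condition numbers are not just diagnostics: solving $\Sigma w = \mathbf{1}$ with an ill-conditioned matrix turns small input errors into large weight changes, which is one reading of the exploding gross market values above. A minimal illustration (the 1e-4 perturbation scale is an arbitrary choice for this sketch):

```python
# How a tiny symmetric perturbation moves the solve() output under ill-conditioning
corr_full = rets_raw.corr().values
ones = np.ones(corr_full.shape[0])
w_base = np.linalg.solve(corr_full, ones)
rng = np.random.default_rng(2025)
noise = rng.normal(scale=1e-4, size=corr_full.shape)
noise = (noise + noise.T) / 2  # keep the perturbed matrix symmetric
w_pert = np.linalg.solve(corr_full + noise, ones)
print(f"Relative weight change: {np.linalg.norm(w_pert - w_base) / np.linalg.norm(w_base):.1%}")
```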
# Correlation Heatmaps for Most Negative and Most Positive Pairs
from matplotlib.colors import TwoSlopeNorm
# Calculate correlation matrix for full dataset
full_corr_matrix = rets_raw.corr()
# Find all pairwise correlations (excluding diagonal)
mask = np.triu(np.ones_like(full_corr_matrix), k=1)
correlations_flat = full_corr_matrix.values[mask.astype(bool)]
# Get indices of the correlations
row_indices, col_indices = np.where(mask)
# Find the 6 stocks involved in the 3 most negative correlations
min_corr_indices = []
min_corr_values = []
correlations_temp = correlations_flat.copy()
for i in range(3):
    min_idx = np.argmin(correlations_temp)
    min_corr_indices.append((row_indices[min_idx], col_indices[min_idx]))
    min_corr_values.append(correlations_temp[min_idx])
    correlations_temp[min_idx] = np.inf  # knock this pair out of later argmin calls
# Get unique stocks from the 3 most negative pairs
min_corr_stocks = set()
for row_idx, col_idx in min_corr_indices:
    min_corr_stocks.add(full_corr_matrix.index[row_idx])
    min_corr_stocks.add(full_corr_matrix.index[col_idx])
min_corr_stocks = sorted(list(min_corr_stocks))
# Find the 6 stocks involved in the 3 most positive correlations
max_corr_indices = []
max_corr_values = []
correlations_temp = correlations_flat.copy()
for i in range(3):
    max_idx = np.argmax(correlations_temp)
    max_corr_indices.append((row_indices[max_idx], col_indices[max_idx]))
    max_corr_values.append(correlations_temp[max_idx])
    correlations_temp[max_idx] = -np.inf  # knock this pair out of later argmax calls
# Get unique stocks from the 3 most positive pairs
max_corr_stocks = set()
for row_idx, col_idx in max_corr_indices:
    max_corr_stocks.add(full_corr_matrix.index[row_idx])
    max_corr_stocks.add(full_corr_matrix.index[col_idx])
max_corr_stocks = sorted(list(max_corr_stocks))
# Create 1x2 subplot figure
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Set up color normalization
center = 0.33
vmin = 0
vmax = 0.67
norm = TwoSlopeNorm(vmin=vmin, vcenter=center, vmax=vmax)
# Plot 1: Most negative correlations subset
min_corr_subset = full_corr_matrix.loc[min_corr_stocks, min_corr_stocks]
# Set upper triangle and diagonal to NaN
mask_min = np.triu(np.ones_like(min_corr_subset), k=0)
min_corr_subset_masked = min_corr_subset.copy()
min_corr_subset_masked[mask_min.astype(bool)] = np.nan
sns.heatmap(min_corr_subset_masked,
annot=True,
fmt='.1%',
norm=norm,
cmap='RdBu_r',
cbar=True,
ax=axes[0],
square=True)
axes[0].set_title('Most Negative Correlations Subset', fontsize=14, fontweight='bold')
axes[0].set_xlabel('')
axes[0].set_ylabel('')
# Plot 2: Most positive correlations subset
max_corr_subset = full_corr_matrix.loc[max_corr_stocks, max_corr_stocks]
# Set upper triangle and diagonal to NaN
mask_max = np.triu(np.ones_like(max_corr_subset), k=0)
max_corr_subset_masked = max_corr_subset.copy()
max_corr_subset_masked[mask_max.astype(bool)] = np.nan
sns.heatmap(max_corr_subset_masked,
annot=True,
fmt='.1%',
norm=norm,
cmap='RdBu_r',
cbar=True,
ax=axes[1],
square=True)
axes[1].set_title('Most Positive Correlations Subset', fontsize=14, fontweight='bold')
axes[1].set_xlabel('')
axes[1].set_ylabel('')
plt.tight_layout()
plt.show()
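The repeated argmin/argmax-and-overwrite loops above do the job; a more compact alternative pulls both extremes from a single `np.argsort` over the flattened correlations, using the same `row_indices`/`col_indices` as above:

```python
# Top-3 extremes in one pass via argsort (ascending order)
order = np.argsort(correlations_flat)
for label, idxs in [('most negative', order[:3]), ('most positive', order[-3:][::-1])]:
    for k in idxs:
        pair = (full_corr_matrix.index[row_indices[k]], full_corr_matrix.columns[col_indices[k]])
        print(f"{label}: {pair[0]} / {pair[1]} = {correlations_flat[k]:+.1%}")
```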
# Get firm names for the most and least correlated pairs
# Read the first sheet to get ticker to company name mapping
ticker_mapping = pd.read_excel(filepath_data, sheet_name=0)
# Create a dictionary mapping tickers to company names
ticker_to_name = dict(zip(ticker_mapping.iloc[:, 0], ticker_mapping.iloc[:, 1]))
# Create DataFrame for 3 most negative correlations
most_negative_pairs = []
for i, (row_idx, col_idx) in enumerate(min_corr_indices):
    stock1 = full_corr_matrix.index[row_idx]
    stock2 = full_corr_matrix.index[col_idx]
    corr_val = min_corr_values[i]
    most_negative_pairs.append({
        'Ticker_1': stock1,
        'Company_1': ticker_to_name.get(stock1, 'Unknown'),
        'Ticker_2': stock2,
        'Company_2': ticker_to_name.get(stock2, 'Unknown'),
        'Correlation': corr_val
    })
most_negative_df = pd.DataFrame(most_negative_pairs)
most_negative_df.index = [f'Pair {i+1}' for i in range(len(most_negative_pairs))]
# Create DataFrame for 3 most positive correlations
most_positive_pairs = []
for i, (row_idx, col_idx) in enumerate(max_corr_indices):
    stock1 = full_corr_matrix.index[row_idx]
    stock2 = full_corr_matrix.index[col_idx]
    corr_val = max_corr_values[i]
    most_positive_pairs.append({
        'Ticker_1': stock1,
        'Company_1': ticker_to_name.get(stock1, 'Unknown'),
        'Ticker_2': stock2,
        'Company_2': ticker_to_name.get(stock2, 'Unknown'),
        'Correlation': corr_val
    })
most_positive_df = pd.DataFrame(most_positive_pairs)
most_positive_df.index = [f'Pair {i+1}' for i in range(len(most_positive_pairs))]
# Display the DataFrames
most_negative_df[['Ticker_1', 'Company_1', 'Ticker_2', 'Company_2', 'Correlation']].style.hide(axis='index').format({'Correlation': '{:.0%}'})
| Ticker_1 | Company_1 | Ticker_2 | Company_2 | Correlation |
|---|---|---|---|---|
| APA | APA Corp | CLX | Clorox Co/The | -9% |
| AXON | Axon Enterprise Inc | CPB | The Campbell's Company | -8% |
| CLX | Clorox Co/The | VLO | Valero Energy Corp | -8% |
most_positive_df[['Ticker_1', 'Company_1', 'Ticker_2', 'Company_2', 'Correlation']].style.hide(axis='index').format({'Correlation': '{:.0%}'})
| Ticker_1 | Company_1 | Ticker_2 | Company_2 | Correlation |
|---|---|---|---|---|
| AVB | AvalonBay Communities Inc | UDR | UDR Inc | 94% |
| FRT | Federal Realty Investment Trust | REG | Regency Centers Corp | 94% |
| AVB | AvalonBay Communities Inc | EQR | Equity Residential | 94% |