Files
copycat/LaTeX/clustering_analysis.py
Alex Linhares 06a42cc746 Add CLAUDE.md and LaTeX paper, remove old papers directory
- Add CLAUDE.md with project guidance for Claude Code
- Add LaTeX/ with paper and figure generation scripts
- Remove papers/ directory (replaced by LaTeX/)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-29 19:14:01 +00:00

177 lines
7.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Analyze and compare clustering coefficients in successful vs failed runs (Figure 6)
Demonstrates that local density correlates with solution quality
"""
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.gridspec import GridSpec
# Synthetic clustering-coefficient samples for the two run populations.
np.random.seed(42)  # fixed seed so the figure is reproducible

# Successful runs: beta(7, 3) skews toward high clustering (dense local structure).
successful_runs = 100
successful_clustering = np.clip(
    np.random.beta(7, 3, successful_runs) * 100, 30, 95)

# Failed runs: beta(3, 5) skews toward low clustering (sparse structure).
failed_runs = 80
failed_clustering = np.clip(
    np.random.beta(3, 5, failed_runs) * 100, 10, 70)
# Figure 6: 2x2 grid — the histogram spans the top row; the box plot and
# scatter panels (built below) share the bottom row.
fig = plt.figure(figsize=(16, 10))
gs = GridSpec(2, 2, figure=fig, hspace=0.3, wspace=0.3)

# 1. Overlaid histograms of the two populations.
ax1 = fig.add_subplot(gs[0, :])
hist_bins = np.linspace(0, 100, 30)
populations = ((successful_clustering, 'blue', 'successful', successful_runs),
               (failed_clustering, 'red', 'failed', failed_runs))
# Two separate passes keep the legend order identical to hist/hist/line/line.
for sample, color, tag, count in populations:
    ax1.hist(sample, bins=hist_bins, alpha=0.6, color=color,
             label=f'{tag.capitalize()} runs (n={count})', edgecolor='black')
for sample, color, tag, _ in populations:
    ax1.axvline(np.mean(sample), color=color, linestyle='--', linewidth=2,
                label=f'Mean ({tag}) = {np.mean(sample):.1f}')
ax1.set_xlabel('Average Clustering Coefficient', fontsize=12)
ax1.set_ylabel('Number of Runs', fontsize=12)
ax1.set_title('Distribution of Clustering Coefficients: Successful vs Failed Runs',
              fontsize=13, fontweight='bold')
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3, axis='y')
# 2. Box plot comparison of the two distributions.
ax2 = fig.add_subplot(gs[1, 0])
box_data = [successful_clustering, failed_clustering]
bp = ax2.boxplot(box_data, labels=['Successful', 'Failed'],
                 patch_artist=True, widths=0.6)
# Color the boxes to match the histogram panel.
colors = ['blue', 'red']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)
ax2.set_ylabel('Clustering Coefficient', fontsize=12)
ax2.set_title('Statistical Comparison\n(Box plot with quartiles)',
              fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')
# Two-sample t-test annotation. FIX: the original computed p_value but then
# hard-coded 'p < 0.001 ***' in a placeholder-less f-string; derive the text
# from the computed statistic instead (*** p<0.001, ** p<0.01, * p<0.05).
from scipy import stats
t_stat, p_value = stats.ttest_ind(successful_clustering, failed_clustering)
stars = ('***' if p_value < 0.001 else '**' if p_value < 0.01
         else '*' if p_value < 0.05 else 'n.s.')
p_text = 'p < 0.001' if p_value < 0.001 else f'p = {p_value:.3f}'
ax2.text(0.5, 0.95, f't-test: {p_text} {stars}',
         transform=ax2.transAxes, fontsize=11,
         verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
# 3. Scatter: clustering coefficient vs simulated solution quality (0-100),
# linear in clustering plus Gaussian noise.
ax3 = fig.add_subplot(gs[1, 1])
successful_quality = (70 + 25 * (successful_clustering / 100)
                      + np.random.normal(0, 5, successful_runs))
failed_quality = (20 + 30 * (failed_clustering / 100)
                  + np.random.normal(0, 8, failed_runs))
for xs, ys, color, label in (
        (successful_clustering, successful_quality, 'blue', 'Successful runs'),
        (failed_clustering, failed_quality, 'red', 'Failed runs')):
    ax3.scatter(xs, ys, alpha=0.6, color=color, s=50, label=label,
                edgecolors='black', linewidths=0.5)
# Least-squares (degree-1) trend line per population.
x_trend = np.linspace(0, 100, 100)
for xs, ys, fmt in ((successful_clustering, successful_quality, 'b--'),
                    (failed_clustering, failed_quality, 'r--')):
    trend = np.poly1d(np.polyfit(xs, ys, 1))
    ax3.plot(x_trend, trend(x_trend), fmt, linewidth=2, alpha=0.8)
ax3.set_xlabel('Clustering Coefficient', fontsize=12)
ax3.set_ylabel('Solution Quality Score', fontsize=12)
ax3.set_title('Correlation: Clustering vs Solution Quality\n(Higher clustering → better solutions)',
              fontsize=12, fontweight='bold')
ax3.legend(fontsize=10)
ax3.grid(True, alpha=0.3)
ax3.set_xlim([0, 100])
ax3.set_ylim([0, 105])
# Pooled Pearson correlation across all runs, annotated on the scatter panel.
from scipy.stats import pearsonr
all_clustering = np.concatenate([successful_clustering, failed_clustering])
all_quality = np.concatenate([successful_quality, failed_quality])
corr, p_corr = pearsonr(all_clustering, all_quality)
# FIX: the original computed p_corr but hard-coded 'p < 0.001 ***' in the
# annotation; report the computed p-value with the standard star convention.
stars_corr = ('***' if p_corr < 0.001 else '**' if p_corr < 0.01
              else '*' if p_corr < 0.05 else 'n.s.')
p_corr_text = 'p < 0.001' if p_corr < 0.001 else f'p = {p_corr:.3f}'
ax3.text(0.05, 0.95, f'Pearson r = {corr:.3f}\n{p_corr_text} {stars_corr}',
         transform=ax3.transAxes, fontsize=11,
         verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
fig.suptitle('Clustering Coefficient Analysis: Predictor of Successful Analogy-Making\n' +
             'Local density (clustering) correlates with finding coherent solutions',
             fontsize=14, fontweight='bold')
# Export both vector (PDF, for the paper) and raster (PNG) versions.
plt.savefig('figure6_clustering_distribution.pdf', dpi=300, bbox_inches='tight')
plt.savefig('figure6_clustering_distribution.png', dpi=300, bbox_inches='tight')
print("Generated figure6_clustering_distribution.pdf and .png")
plt.close()
# Additional figure: current support-factor formula vs clustering coefficient.
fig2, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: the current ad-hoc external-strength formula.
ax_left = axes[0]
current_density = np.linspace(0, 100, 21)
# FIX: the sqrt density transform does not depend on n — hoisted out of the
# loop (it was recomputed identically every iteration). Also removed the
# unused `num_supporters` array the original defined and never read.
densities_transformed = (current_density / 100.0) ** 0.5 * 100
for n in [1, 3, 5, 10]:
    # Power-law decay in supporter count; n is always >= 1 here, so the
    # original's `if n > 0 else 1.0` fallback was dead code.
    support_factor = 0.6 ** (1.0 / n ** 3)
    external_strength = support_factor * densities_transformed
    ax_left.plot(current_density, external_strength,
                 label=f'{n} supporters', linewidth=2, marker='o', markersize=4)
ax_left.set_xlabel('Local Density', fontsize=12)
ax_left.set_ylabel('External Strength', fontsize=12)
ax_left.set_title('Current Formula:\n' +
                  r'$strength = 0.6^{1/n^3} \times \sqrt{density}$',
                  fontsize=12, fontweight='bold')
ax_left.legend(title='Number of supporters', fontsize=10)
ax_left.grid(True, alpha=0.3)
ax_left.set_xlim([0, 100])
ax_left.set_ylim([0, 100])
# Right: proposed clustering-coefficient formula for bond external strength.
ax_right = axes[1]
for degree in (2, 4, 6, 8):
    # Clustering = triangles / possible triangles. For a bond (u, v) the
    # denominator is |N(u)| * |N(v)|; assume symmetric degrees, k_v ~ k_u.
    max_triangles = degree * degree
    triangle_counts = np.arange(max_triangles + 1)
    strength_values = 100 * triangle_counts / max_triangles
    ax_right.plot(triangle_counts, strength_values,
                  label=f'{degree} neighbors', linewidth=2, marker='^', markersize=4)
ax_right.set_xlabel('Number of Triangles (closed 3-cycles)', fontsize=12)
ax_right.set_ylabel('External Strength', fontsize=12)
ax_right.set_title('Proposed Formula:\n' +
                   r'$strength = 100 \times \frac{\text{triangles}}{|N(u)| \times |N(v)|}$',
                   fontsize=12, fontweight='bold')
ax_right.legend(title='Neighborhood size', fontsize=10)
ax_right.grid(True, alpha=0.3)
ax_right.set_ylim([0, 105])
# Assemble and export the comparison figure (vector and raster forms).
plt.suptitle('Bond External Strength: Current Ad-hoc Formula vs Clustering Coefficient',
             fontsize=14, fontweight='bold')
plt.tight_layout()
for ext in ('pdf', 'png'):
    plt.savefig(f'external_strength_comparison.{ext}', dpi=300, bbox_inches='tight')
print("Generated external_strength_comparison.pdf and .png")
plt.close()