"""Compute correlation statistics for the paper (hop-based).""" import json import numpy as np from scipy import stats def main(): with open(r'C:\Users\alexa\copycat\slipnet_analysis\slipnet.json', 'r') as f: data = json.load(f) # Extract data points (excluding letter nodes themselves) names = [] depths = [] hops = [] is_unreachable = [] for node in data['nodes']: name = node['name'] depth = node['conceptualDepth'] path_info = node.get('minPathToLetter', {}) hop_count = path_info.get('hops') nearest = path_info.get('nearestLetter') # Skip letter nodes (hops 0) if hop_count is not None and hop_count > 0: names.append(name) depths.append(depth) hops.append(hop_count) is_unreachable.append(nearest is None) # Convert to numpy arrays depths = np.array(depths) hops = np.array(hops) # Compute correlation correlation, p_value = stats.pearsonr(depths, hops) spearman_corr, spearman_p = stats.spearmanr(depths, hops) # Linear regression z = np.polyfit(depths, hops, 1) # R-squared y_pred = np.polyval(z, depths) ss_res = np.sum((hops - y_pred) ** 2) ss_tot = np.sum((hops - np.mean(hops)) ** 2) r_squared = 1 - (ss_res / ss_tot) num_unreachable = sum(is_unreachable) print(f"Number of nodes analyzed: {len(names)}") print(f"Total nodes: {data['nodeCount']}") print(f"Letter nodes (excluded): 26") print(f"Unreachable nodes (hops = 2*max): {num_unreachable}") print() print(f"Pearson correlation: r = {correlation:.4f}") print(f"Pearson p-value: p = {p_value:.6f}") print(f"Spearman correlation: rho = {spearman_corr:.4f}") print(f"Spearman p-value: p = {spearman_p:.6f}") print(f"R-squared: {r_squared:.4f}") print(f"Linear regression: hops = {z[0]:.4f} * depth + {z[1]:.4f}") print() print(f"Depth range: {min(depths):.1f} - {max(depths):.1f}") print(f"Hops range: {min(hops)} - {max(hops)}") print(f"Mean depth: {np.mean(depths):.2f}") print(f"Mean hops: {np.mean(hops):.2f}") print(f"Std depth: {np.std(depths):.2f}") print(f"Std hops: {np.std(hops):.2f}") print() # Distribution of hops print("Distribution of hops:") for h in sorted(set(hops)): count = sum(1 for x in hops if x == h) nodes_at_h = [n for n, hp in zip(names, hops) if hp == h] print(f" {h} hops: {count} nodes") print() print("Data points (sorted by hops, then depth):") print(f"{'Node':<30} {'Depth':<10} {'Hops':<10} {'Reachable':<10}") print("-" * 60) for name, depth, hop, unreachable in sorted(zip(names, depths, hops, is_unreachable), key=lambda x: (x[2], x[1])): status = "No" if unreachable else "Yes" print(f"{name:<30} {depth:<10.1f} {hop:<10} {status:<10}") if __name__ == '__main__': main()