From 7a4444c93017eb17330dd0455ab583f1af778493 Mon Sep 17 00:00:00 2001
From: bmachado <bmachado@zatio.com>
Date: Thu, 11 Dec 2025 16:52:05 +0000
Subject: [PATCH] tag analysis

---
 scripts/analyze_tags.py | 410 ++++++++++++++++++++++++++++++++++++++++
 todo-2.md               |   5 +
 2 files changed, 415 insertions(+)
 create mode 100755 scripts/analyze_tags.py
 create mode 100644 todo-2.md

diff --git a/scripts/analyze_tags.py b/scripts/analyze_tags.py
new file mode 100755
index 0000000..7ccac6a
--- /dev/null
+++ b/scripts/analyze_tags.py
@@ -0,0 +1,410 @@
+#!/usr/bin/env python3
+"""
+Analyze arc_puzzles_tags table for patterns, correlations, and duplicates.
+"""
+
+import os
+import sys
+import pymysql
+from dotenv import load_dotenv
+from collections import defaultdict
+
+def load_env_config():
+    """Load database configuration from .env file"""
+    load_dotenv()
+    return {
+        'host': os.getenv('DB_HOST'),
+        'user': os.getenv('DB_USER'),
+        'password': os.getenv('DB_PASSWORD'),
+        'database': os.getenv('DB_NAME'),
+        'port': int(os.getenv('DB_PORT', 3306)),
+        'charset': 'utf8mb4'
+    }
+
+def print_section(title):
+    print(f"\n{'=' * 70}")
+    print(f" {title}")
+    print('=' * 70)
+
+def print_table(headers, rows, col_widths=None):
+    if not col_widths:
+        col_widths = [max(len(str(h)), max(len(str(r[i])) for r in rows) if rows else 0)
+                      for i, h in enumerate(headers)]
+
+    header_line = " | ".join(str(h).ljust(w) for h, w in zip(headers, col_widths))
+    print(header_line)
+    print("-" * len(header_line))
+    for row in rows:
+        print(" | ".join(str(v).ljust(w) for v, w in zip(row, col_widths)))
+
+def run_analysis(cursor):
+    results = {}
+
+    # 1. Most Common Tags
+    print_section("1. MOST COMMON TAGS")
+    cursor.execute("""
+        SELECT
+            t.name AS tag_name,
+            t.type AS tag_type,
+            COUNT(*) AS usage_count
+        FROM arc_puzzles_tags apt
+        JOIN tags t ON apt.tags_id = t.id
+        GROUP BY t.id, t.name, t.type
+        ORDER BY usage_count DESC
+        LIMIT 20
+    """)
+    rows = cursor.fetchall()
+    print_table(["Tag Name", "Type", "Usage"], rows, [40, 25, 10])
+    results['most_common'] = rows
+
+    # 2. Tag Correlations
+    print_section("2. TAG CORRELATIONS (tags that frequently appear together)")
+    cursor.execute("""
+        SELECT
+            t1.name AS tag1,
+            t2.name AS tag2,
+            COUNT(*) AS co_occurrence
+        FROM arc_puzzles_tags apt1
+        JOIN arc_puzzles_tags apt2 ON apt1.arc_puzzles_id = apt2.arc_puzzles_id AND apt1.tags_id < apt2.tags_id
+        JOIN tags t1 ON apt1.tags_id = t1.id
+        JOIN tags t2 ON apt2.tags_id = t2.id
+        GROUP BY t1.id, t2.id, t1.name, t2.name
+        HAVING co_occurrence >= 3
+        ORDER BY co_occurrence DESC
+        LIMIT 25
+    """)
+    rows = cursor.fetchall()
+    print_table(["Tag 1", "Tag 2", "Co-occurrences"], rows, [35, 35, 15])
+    results['correlations'] = rows
+
+    # 3. Possible Duplicate Tags
+    print_section("3. POSSIBLE DUPLICATE TAGS (similar names)")
+    cursor.execute("""
+        SELECT
+            t1.id AS id1,
+            t1.name AS name1,
+            t1.type AS type1,
+            t2.id AS id2,
+            t2.name AS name2,
+            t2.type AS type2
+        FROM tags t1
+        JOIN tags t2 ON t1.id < t2.id
+        WHERE
+            LOWER(REPLACE(t1.name, ' ', '')) = LOWER(REPLACE(t2.name, ' ', ''))
+            OR LOWER(t1.name) LIKE CONCAT('%', LOWER(t2.name), '%')
+            OR LOWER(t2.name) LIKE CONCAT('%', LOWER(t1.name), '%')
+            OR SOUNDEX(t1.name) = SOUNDEX(t2.name)
+        ORDER BY t1.name
+    """)
+    rows = cursor.fetchall()
+    print_table(["ID1", "Name 1", "Type 1", "ID2", "Name 2", "Type 2"], rows, [5, 30, 15, 5, 30, 15])
+    results['duplicates'] = rows
+
+    # 4. Tag Types Distribution
+    print_section("4. TAG TYPES DISTRIBUTION")
+    cursor.execute("""
+        SELECT
+            t.type AS tag_type,
+            COUNT(DISTINCT t.id) AS unique_tags,
+            COUNT(*) AS total_usage
+        FROM tags t
+        LEFT JOIN arc_puzzles_tags apt ON t.id = apt.tags_id
+        GROUP BY t.type
+        ORDER BY total_usage DESC
+    """)
+    rows = cursor.fetchall()
+    print_table(["Tag Type", "Unique Tags", "Total Usage"], rows, [25, 15, 15])
+    results['types'] = rows
+
+    # 5. Puzzles with Most Tags
+    print_section("5. PUZZLES WITH MOST TAGS")
+    cursor.execute("""
+        SELECT
+            apt.arc_puzzles_id,
+            COUNT(*) AS tag_count,
+            GROUP_CONCAT(t.name SEPARATOR ', ') AS tags
+        FROM arc_puzzles_tags apt
+        JOIN tags t ON apt.tags_id = t.id
+        GROUP BY apt.arc_puzzles_id
+        ORDER BY tag_count DESC
+        LIMIT 10
+    """)
+    rows = cursor.fetchall()
+    for row in rows:
+        print(f"\n{row[0]} ({row[1]} tags):")
+        print(f"  {row[2][:100]}..." if len(row[2]) > 100 else f"  {row[2]}")
+    results['most_tagged'] = rows
+
+    # 6. Puzzles with Fewest Tags
+    print_section("6. PUZZLES WITH FEWEST TAGS")
+    cursor.execute("""
+        SELECT
+            apt.arc_puzzles_id,
+            COUNT(*) AS tag_count,
+            GROUP_CONCAT(t.name SEPARATOR ', ') AS tags
+        FROM arc_puzzles_tags apt
+        JOIN tags t ON apt.tags_id = t.id
+        GROUP BY apt.arc_puzzles_id
+        ORDER BY tag_count ASC
+        LIMIT 10
+    """)
+    rows = cursor.fetchall()
+    for row in rows:
+        print(f"{row[0]} ({row[1]} tags): {row[2]}")
+    results['least_tagged'] = rows
+
+    # 7. Duplicate Tag Assignments (same tag assigned multiple times to same puzzle)
+    print_section("7. DUPLICATE TAG ASSIGNMENTS (same tag on same puzzle by different users)")
+    cursor.execute("""
+        SELECT
+            apt.arc_puzzles_id,
+            t.name AS tag_name,
+            COUNT(*) AS times_assigned,
+            GROUP_CONCAT(DISTINCT u.name SEPARATOR ', ') AS assigned_by_users
+        FROM arc_puzzles_tags apt
+        JOIN tags t ON apt.tags_id = t.id
+        LEFT JOIN users u ON apt.user_id = u.id
+        GROUP BY apt.arc_puzzles_id, apt.tags_id, t.name
+        HAVING times_assigned > 1
+        ORDER BY times_assigned DESC
+        LIMIT 15
+    """)
+    rows = cursor.fetchall()
+    print_table(["Puzzle ID", "Tag Name", "Times", "Assigned By"], rows, [15, 35, 8, 25])
+    results['duplicate_assignments'] = rows
+
+    # 8. User Tagging Activity
+    print_section("8. USER TAGGING ACTIVITY")
+    cursor.execute("""
+        SELECT
+            u.name AS user_name,
+            COUNT(*) AS tags_assigned,
+            COUNT(DISTINCT apt.arc_puzzles_id) AS puzzles_tagged,
+            COUNT(DISTINCT apt.tags_id) AS unique_tags_used
+        FROM arc_puzzles_tags apt
+        JOIN users u ON apt.user_id = u.id
+        GROUP BY u.id, u.name
+        ORDER BY tags_assigned DESC
+    """)
+    rows = cursor.fetchall()
+    print_table(["User", "Tags Assigned", "Puzzles Tagged", "Unique Tags Used"], rows, [20, 15, 15, 17])
+    results['user_activity'] = rows
+
+    # 9. Unused Tags
+    print_section("9. UNUSED TAGS (defined but never assigned)")
+    cursor.execute("""
+        SELECT
+            t.id,
+            t.name,
+            t.type,
+            COALESCE(t.description, '') as description
+        FROM tags t
+        LEFT JOIN arc_puzzles_tags apt ON t.id = apt.tags_id
+        WHERE apt.id IS NULL
+        ORDER BY t.type, t.name
+    """)
+    rows = cursor.fetchall()
+    print_table(["ID", "Name", "Type", "Description"], rows, [5, 25, 15, 40])
+    results['unused'] = rows
+
+    # 10. Tag Agreement Rate
+    print_section("10. TAG AGREEMENT RATE (when both users tagged same puzzle)")
+    cursor.execute("""
+        SELECT
+            t.name AS tag_name,
+            SUM(CASE WHEN user_count = 2 THEN 1 ELSE 0 END) AS both_users_agreed,
+            SUM(CASE WHEN user_count = 1 THEN 1 ELSE 0 END) AS only_one_user,
+            ROUND(SUM(CASE WHEN user_count = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS agreement_pct
+        FROM (
+            SELECT apt.arc_puzzles_id, apt.tags_id, COUNT(DISTINCT apt.user_id) AS user_count
+            FROM arc_puzzles_tags apt
+            GROUP BY apt.arc_puzzles_id, apt.tags_id
+        ) sub
+        JOIN tags t ON sub.tags_id = t.id
+        GROUP BY t.id, t.name
+        HAVING COUNT(*) >= 5
+        ORDER BY agreement_pct DESC
+        LIMIT 20
+    """)
+    rows = cursor.fetchall()
+    print_table(["Tag Name", "Both Agreed", "Only One", "Agreement %"], rows, [40, 12, 10, 12])
+    results['agreement'] = rows
+
+    # 11. High Agreement Puzzles (puzzles where users agreed on tags)
+    print_section("11. PUZZLES WITH HIGH TAG AGREEMENT")
+    cursor.execute("""
+        SELECT
+            sub.arc_puzzles_id,
+            SUM(CASE WHEN sub.user_count = 2 THEN 1 ELSE 0 END) AS agreed_tags,
+            SUM(CASE WHEN sub.user_count = 1 THEN 1 ELSE 0 END) AS single_user_tags,
+            COUNT(*) AS total_unique_tags,
+            ROUND(SUM(CASE WHEN sub.user_count = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS agreement_pct
+        FROM (
+            SELECT arc_puzzles_id, tags_id, COUNT(DISTINCT user_id) AS user_count
+            FROM arc_puzzles_tags
+            GROUP BY arc_puzzles_id, tags_id
+        ) sub
+        WHERE sub.arc_puzzles_id IN (
+            SELECT arc_puzzles_id
+            FROM arc_puzzles_tags
+            GROUP BY arc_puzzles_id
+            HAVING COUNT(DISTINCT user_id) > 1
+        )
+        GROUP BY sub.arc_puzzles_id
+        ORDER BY agreement_pct DESC, agreed_tags DESC
+    """)
+    rows = cursor.fetchall()
+    print_table(["Puzzle ID", "Agreed", "Single User", "Total Tags", "Agreement %"], rows, [15, 8, 12, 12, 12])
+    results['high_agreement_puzzles'] = rows
+
+    # 12. User Overlap Analysis
+    print_section("12. USER OVERLAP ANALYSIS")
+    cursor.execute("""
+        SELECT
+            (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags apt
+             JOIN users u ON apt.user_id = u.id WHERE u.name = 'Bernardo') AS bernardo_total,
+            (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags apt
+             JOIN users u ON apt.user_id = u.id WHERE u.name = 'Eric') AS eric_total,
+            (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags
+             WHERE arc_puzzles_id IN (
+                SELECT arc_puzzles_id FROM arc_puzzles_tags
+                GROUP BY arc_puzzles_id HAVING COUNT(DISTINCT user_id) > 1
+            )) AS both_tagged
+    """)
+    row = cursor.fetchone()
+    print(f"  Bernardo tagged: {row[0]} puzzles")
+    print(f"  Eric tagged: {row[1]} puzzles")
+    print(f"  Both tagged (overlap): {row[2]} puzzles")
+    print(f"  Overlap rate: {round(row[2] * 100.0 / min(row[0], row[1]), 1)}%")
+    results['user_overlap'] = row
+
+    # 13. Detailed Agreement for Overlapping Puzzles
+    print_section("13. DETAILED TAG AGREEMENT FOR OVERLAPPING PUZZLES")
+    cursor.execute("""
+        SELECT
+            apt.arc_puzzles_id,
+            t.name AS tag_name,
+            t.type AS tag_type,
+            GROUP_CONCAT(DISTINCT u.name ORDER BY u.name SEPARATOR ', ') AS tagged_by,
+            COUNT(DISTINCT apt.user_id) AS user_count,
+            CASE WHEN COUNT(DISTINCT apt.user_id) = 2 THEN 'AGREED' ELSE 'SINGLE' END AS status
+        FROM arc_puzzles_tags apt
+        JOIN tags t ON apt.tags_id = t.id
+        JOIN users u ON apt.user_id = u.id
+        WHERE apt.arc_puzzles_id IN (
+            SELECT arc_puzzles_id
+            FROM arc_puzzles_tags
+            GROUP BY arc_puzzles_id
+            HAVING COUNT(DISTINCT user_id) > 1
+        )
+        GROUP BY apt.arc_puzzles_id, apt.tags_id, t.name, t.type
+        ORDER BY apt.arc_puzzles_id, user_count DESC, t.name
+    """)
+    rows = cursor.fetchall()
+
+    current_puzzle = None
+    for row in rows:
+        if row[0] != current_puzzle:
+            current_puzzle = row[0]
+            print(f"\n  Puzzle: {current_puzzle}")
+            print(f"  {'-' * 60}")
+        status_marker = "+" if row[5] == 'AGREED' else " "
+        print(f"    {status_marker} {row[1]:<35} ({row[2]:<20}) - {row[3]}")
+    results['detailed_agreement'] = rows
+
+    # 14. Tags Users Tend to Disagree On
+    print_section("14. TAGS WITH MOST DISAGREEMENT (one user tags, other doesn't)")
+    cursor.execute("""
+        SELECT
+            t.name AS tag_name,
+            t.type AS tag_type,
+            u.name AS only_tagged_by,
+            COUNT(*) AS times_only_one_tagged
+        FROM arc_puzzles_tags apt
+        JOIN tags t ON apt.tags_id = t.id
+        JOIN users u ON apt.user_id = u.id
+        WHERE (apt.arc_puzzles_id, apt.tags_id) IN (
+            SELECT arc_puzzles_id, tags_id
+            FROM arc_puzzles_tags
+            GROUP BY arc_puzzles_id, tags_id
+            HAVING COUNT(DISTINCT user_id) = 1
+        )
+        AND apt.arc_puzzles_id IN (
+            SELECT arc_puzzles_id
+            FROM arc_puzzles_tags
+            GROUP BY arc_puzzles_id
+            HAVING COUNT(DISTINCT user_id) > 1
+        )
+        GROUP BY t.id, t.name, t.type, u.name
+        ORDER BY times_only_one_tagged DESC
+        LIMIT 20
+    """)
+    rows = cursor.fetchall()
+    print_table(["Tag Name", "Type", "Only Tagged By", "Times"], rows, [35, 20, 15, 8])
+    results['disagreement_tags'] = rows
+
+    # 15. Tags Users Consistently Agree On
+    print_section("15. TAGS USERS CONSISTENTLY AGREE ON")
+    cursor.execute("""
+        SELECT
+            t.name AS tag_name,
+            t.type AS tag_type,
+            COUNT(*) AS times_both_agreed
+        FROM (
+            SELECT arc_puzzles_id, tags_id
+            FROM arc_puzzles_tags
+            GROUP BY arc_puzzles_id, tags_id
+            HAVING COUNT(DISTINCT user_id) = 2
+        ) agreed
+        JOIN tags t ON agreed.tags_id = t.id
+        GROUP BY t.id, t.name, t.type
+        ORDER BY times_both_agreed DESC
+    """)
+    rows = cursor.fetchall()
+    print_table(["Tag Name", "Type", "Times Agreed"], rows, [40, 20, 12])
+    results['agreement_tags'] = rows
+
+    return results
+
+def main():
+    print("ARC Puzzles Tags Analysis")
+    print("=" * 70)
+
+    try:
+        config = load_env_config()
+        print(f"Connecting to {config['host']}:{config['port']}/{config['database']}...")
+        connection = pymysql.connect(**config)
+        print("Connected successfully!\n")
+    except Exception as e:
+        print(f"Error connecting to database: {e}")
+        return 1
+
+    try:
+        cursor = connection.cursor()
+        results = run_analysis(cursor)
+
+        # Summary
+        print_section("SUMMARY")
+        print(f"Total tag assignments analyzed")
+        cursor.execute("SELECT COUNT(*) FROM arc_puzzles_tags")
+        print(f"  - Total assignments: {cursor.fetchone()[0]}")
+        cursor.execute("SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags")
+        print(f"  - Puzzles with tags: {cursor.fetchone()[0]}")
+        cursor.execute("SELECT COUNT(*) FROM tags")
+        print(f"  - Total tags defined: {cursor.fetchone()[0]}")
+        print(f"  - Unused tags: {len(results['unused'])}")
+        print(f"  - Possible duplicate tags: {len(results['duplicates'])}")
+
+    except Exception as e:
+        print(f"\nError during analysis: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+    finally:
+        connection.close()
+        print(f"\nDatabase connection closed.")
+
+    return 0
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/todo-2.md b/todo-2.md
new file mode 100644
index 0000000..b7f409d
--- /dev/null
+++ b/todo-2.md
@@ -0,0 +1,5 @@
+- [x] Editing of Existing Answers
+- [x] Editing of Existing Skills
+- [x] Add the problem viewer to the User Eval Viewer.
+- Refresh the list in the user eval viewer when selected
+- In the "Explorer" tab I'd like to see a way to link to the annotations for each retrieved example.