From 7a4444c93017eb17330dd0455ab583f1af778493 Mon Sep 17 00:00:00 2001 From: bmachado Date: Thu, 11 Dec 2025 16:52:05 +0000 Subject: [PATCH] tag analysis --- scripts/analyze_tags.py | 410 ++++++++++++++++++++++++++++++++++++++++ todo-2.md | 5 + 2 files changed, 415 insertions(+) create mode 100755 scripts/analyze_tags.py create mode 100644 todo-2.md diff --git a/scripts/analyze_tags.py b/scripts/analyze_tags.py new file mode 100755 index 0000000..7ccac6a --- /dev/null +++ b/scripts/analyze_tags.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +""" +Analyze arc_puzzles_tags table for patterns, correlations, and duplicates. +""" + +import os +import sys +import pymysql +from dotenv import load_dotenv +from collections import defaultdict + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + return { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + +def print_section(title): + print(f"\n{'=' * 70}") + print(f" {title}") + print('=' * 70) + +def print_table(headers, rows, col_widths=None): + if not col_widths: + col_widths = [max(len(str(h)), max(len(str(r[i])) for r in rows) if rows else 0) + for i, h in enumerate(headers)] + + header_line = " | ".join(str(h).ljust(w) for h, w in zip(headers, col_widths)) + print(header_line) + print("-" * len(header_line)) + for row in rows: + print(" | ".join(str(v).ljust(w) for v, w in zip(row, col_widths))) + +def run_analysis(cursor): + results = {} + + # 1. Most Common Tags + print_section("1. MOST COMMON TAGS") + cursor.execute(""" + SELECT + t.name AS tag_name, + t.type AS tag_type, + COUNT(*) AS usage_count + FROM arc_puzzles_tags apt + JOIN tags t ON apt.tags_id = t.id + GROUP BY t.id, t.name, t.type + ORDER BY usage_count DESC + LIMIT 20 + """) + rows = cursor.fetchall() + print_table(["Tag Name", "Type", "Usage"], rows, [40, 25, 10]) + results['most_common'] = rows + + # 2. Tag Correlations + print_section("2. TAG CORRELATIONS (tags that frequently appear together)") + cursor.execute(""" + SELECT + t1.name AS tag1, + t2.name AS tag2, + COUNT(*) AS co_occurrence + FROM arc_puzzles_tags apt1 + JOIN arc_puzzles_tags apt2 ON apt1.arc_puzzles_id = apt2.arc_puzzles_id AND apt1.tags_id < apt2.tags_id + JOIN tags t1 ON apt1.tags_id = t1.id + JOIN tags t2 ON apt2.tags_id = t2.id + GROUP BY t1.id, t2.id, t1.name, t2.name + HAVING co_occurrence >= 3 + ORDER BY co_occurrence DESC + LIMIT 25 + """) + rows = cursor.fetchall() + print_table(["Tag 1", "Tag 2", "Co-occurrences"], rows, [35, 35, 15]) + results['correlations'] = rows + + # 3. Possible Duplicate Tags + print_section("3. POSSIBLE DUPLICATE TAGS (similar names)") + cursor.execute(""" + SELECT + t1.id AS id1, + t1.name AS name1, + t1.type AS type1, + t2.id AS id2, + t2.name AS name2, + t2.type AS type2 + FROM tags t1 + JOIN tags t2 ON t1.id < t2.id + WHERE + LOWER(REPLACE(t1.name, ' ', '')) = LOWER(REPLACE(t2.name, ' ', '')) + OR LOWER(t1.name) LIKE CONCAT('%', LOWER(t2.name), '%') + OR LOWER(t2.name) LIKE CONCAT('%', LOWER(t1.name), '%') + OR SOUNDEX(t1.name) = SOUNDEX(t2.name) + ORDER BY t1.name + """) + rows = cursor.fetchall() + print_table(["ID1", "Name 1", "Type 1", "ID2", "Name 2", "Type 2"], rows, [5, 30, 15, 5, 30, 15]) + results['duplicates'] = rows + + # 4. Tag Types Distribution + print_section("4. TAG TYPES DISTRIBUTION") + cursor.execute(""" + SELECT + t.type AS tag_type, + COUNT(DISTINCT t.id) AS unique_tags, + COUNT(*) AS total_usage + FROM tags t + LEFT JOIN arc_puzzles_tags apt ON t.id = apt.tags_id + GROUP BY t.type + ORDER BY total_usage DESC + """) + rows = cursor.fetchall() + print_table(["Tag Type", "Unique Tags", "Total Usage"], rows, [25, 15, 15]) + results['types'] = rows + + # 5. Puzzles with Most Tags + print_section("5. PUZZLES WITH MOST TAGS") + cursor.execute(""" + SELECT + apt.arc_puzzles_id, + COUNT(*) AS tag_count, + GROUP_CONCAT(t.name SEPARATOR ', ') AS tags + FROM arc_puzzles_tags apt + JOIN tags t ON apt.tags_id = t.id + GROUP BY apt.arc_puzzles_id + ORDER BY tag_count DESC + LIMIT 10 + """) + rows = cursor.fetchall() + for row in rows: + print(f"\n{row[0]} ({row[1]} tags):") + print(f" {row[2][:100]}..." if len(row[2]) > 100 else f" {row[2]}") + results['most_tagged'] = rows + + # 6. Puzzles with Fewest Tags + print_section("6. PUZZLES WITH FEWEST TAGS") + cursor.execute(""" + SELECT + apt.arc_puzzles_id, + COUNT(*) AS tag_count, + GROUP_CONCAT(t.name SEPARATOR ', ') AS tags + FROM arc_puzzles_tags apt + JOIN tags t ON apt.tags_id = t.id + GROUP BY apt.arc_puzzles_id + ORDER BY tag_count ASC + LIMIT 10 + """) + rows = cursor.fetchall() + for row in rows: + print(f"{row[0]} ({row[1]} tags): {row[2]}") + results['least_tagged'] = rows + + # 7. Duplicate Tag Assignments (same tag assigned multiple times to same puzzle) + print_section("7. DUPLICATE TAG ASSIGNMENTS (same tag on same puzzle by different users)") + cursor.execute(""" + SELECT + apt.arc_puzzles_id, + t.name AS tag_name, + COUNT(*) AS times_assigned, + GROUP_CONCAT(DISTINCT u.name SEPARATOR ', ') AS assigned_by_users + FROM arc_puzzles_tags apt + JOIN tags t ON apt.tags_id = t.id + LEFT JOIN users u ON apt.user_id = u.id + GROUP BY apt.arc_puzzles_id, apt.tags_id, t.name + HAVING times_assigned > 1 + ORDER BY times_assigned DESC + LIMIT 15 + """) + rows = cursor.fetchall() + print_table(["Puzzle ID", "Tag Name", "Times", "Assigned By"], rows, [15, 35, 8, 25]) + results['duplicate_assignments'] = rows + + # 8. User Tagging Activity + print_section("8. USER TAGGING ACTIVITY") + cursor.execute(""" + SELECT + u.name AS user_name, + COUNT(*) AS tags_assigned, + COUNT(DISTINCT apt.arc_puzzles_id) AS puzzles_tagged, + COUNT(DISTINCT apt.tags_id) AS unique_tags_used + FROM arc_puzzles_tags apt + JOIN users u ON apt.user_id = u.id + GROUP BY u.id, u.name + ORDER BY tags_assigned DESC + """) + rows = cursor.fetchall() + print_table(["User", "Tags Assigned", "Puzzles Tagged", "Unique Tags Used"], rows, [20, 15, 15, 17]) + results['user_activity'] = rows + + # 9. Unused Tags + print_section("9. UNUSED TAGS (defined but never assigned)") + cursor.execute(""" + SELECT + t.id, + t.name, + t.type, + COALESCE(t.description, '') as description + FROM tags t + LEFT JOIN arc_puzzles_tags apt ON t.id = apt.tags_id + WHERE apt.id IS NULL + ORDER BY t.type, t.name + """) + rows = cursor.fetchall() + print_table(["ID", "Name", "Type", "Description"], rows, [5, 25, 15, 40]) + results['unused'] = rows + + # 10. Tag Agreement Rate + print_section("10. TAG AGREEMENT RATE (when both users tagged same puzzle)") + cursor.execute(""" + SELECT + t.name AS tag_name, + SUM(CASE WHEN user_count = 2 THEN 1 ELSE 0 END) AS both_users_agreed, + SUM(CASE WHEN user_count = 1 THEN 1 ELSE 0 END) AS only_one_user, + ROUND(SUM(CASE WHEN user_count = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS agreement_pct + FROM ( + SELECT apt.arc_puzzles_id, apt.tags_id, COUNT(DISTINCT apt.user_id) AS user_count + FROM arc_puzzles_tags apt + GROUP BY apt.arc_puzzles_id, apt.tags_id + ) sub + JOIN tags t ON sub.tags_id = t.id + GROUP BY t.id, t.name + HAVING COUNT(*) >= 5 + ORDER BY agreement_pct DESC + LIMIT 20 + """) + rows = cursor.fetchall() + print_table(["Tag Name", "Both Agreed", "Only One", "Agreement %"], rows, [40, 12, 10, 12]) + results['agreement'] = rows + + # 11. High Agreement Puzzles (puzzles where users agreed on tags) + print_section("11. PUZZLES WITH HIGH TAG AGREEMENT") + cursor.execute(""" + SELECT + sub.arc_puzzles_id, + SUM(CASE WHEN sub.user_count = 2 THEN 1 ELSE 0 END) AS agreed_tags, + SUM(CASE WHEN sub.user_count = 1 THEN 1 ELSE 0 END) AS single_user_tags, + COUNT(*) AS total_unique_tags, + ROUND(SUM(CASE WHEN sub.user_count = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS agreement_pct + FROM ( + SELECT arc_puzzles_id, tags_id, COUNT(DISTINCT user_id) AS user_count + FROM arc_puzzles_tags + GROUP BY arc_puzzles_id, tags_id + ) sub + WHERE sub.arc_puzzles_id IN ( + SELECT arc_puzzles_id + FROM arc_puzzles_tags + GROUP BY arc_puzzles_id + HAVING COUNT(DISTINCT user_id) > 1 + ) + GROUP BY sub.arc_puzzles_id + ORDER BY agreement_pct DESC, agreed_tags DESC + """) + rows = cursor.fetchall() + print_table(["Puzzle ID", "Agreed", "Single User", "Total Tags", "Agreement %"], rows, [15, 8, 12, 12, 12]) + results['high_agreement_puzzles'] = rows + + # 12. User Overlap Analysis + print_section("12. USER OVERLAP ANALYSIS") + cursor.execute(""" + SELECT + (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags apt + JOIN users u ON apt.user_id = u.id WHERE u.name = 'Bernardo') AS bernardo_total, + (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags apt + JOIN users u ON apt.user_id = u.id WHERE u.name = 'Eric') AS eric_total, + (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags + WHERE arc_puzzles_id IN ( + SELECT arc_puzzles_id FROM arc_puzzles_tags + GROUP BY arc_puzzles_id HAVING COUNT(DISTINCT user_id) > 1 + )) AS both_tagged + """) + row = cursor.fetchone() + print(f" Bernardo tagged: {row[0]} puzzles") + print(f" Eric tagged: {row[1]} puzzles") + print(f" Both tagged (overlap): {row[2]} puzzles") + print(f" Overlap rate: {round(row[2] * 100.0 / min(row[0], row[1]), 1)}%") + results['user_overlap'] = row + + # 13. Detailed Agreement for Overlapping Puzzles + print_section("13. DETAILED TAG AGREEMENT FOR OVERLAPPING PUZZLES") + cursor.execute(""" + SELECT + apt.arc_puzzles_id, + t.name AS tag_name, + t.type AS tag_type, + GROUP_CONCAT(DISTINCT u.name ORDER BY u.name SEPARATOR ', ') AS tagged_by, + COUNT(DISTINCT apt.user_id) AS user_count, + CASE WHEN COUNT(DISTINCT apt.user_id) = 2 THEN 'AGREED' ELSE 'SINGLE' END AS status + FROM arc_puzzles_tags apt + JOIN tags t ON apt.tags_id = t.id + JOIN users u ON apt.user_id = u.id + WHERE apt.arc_puzzles_id IN ( + SELECT arc_puzzles_id + FROM arc_puzzles_tags + GROUP BY arc_puzzles_id + HAVING COUNT(DISTINCT user_id) > 1 + ) + GROUP BY apt.arc_puzzles_id, apt.tags_id, t.name, t.type + ORDER BY apt.arc_puzzles_id, user_count DESC, t.name + """) + rows = cursor.fetchall() + + current_puzzle = None + for row in rows: + if row[0] != current_puzzle: + current_puzzle = row[0] + print(f"\n Puzzle: {current_puzzle}") + print(f" {'-' * 60}") + status_marker = "+" if row[5] == 'AGREED' else " " + print(f" {status_marker} {row[1]:<35} ({row[2]:<20}) - {row[3]}") + results['detailed_agreement'] = rows + + # 14. Tags Users Tend to Disagree On + print_section("14. TAGS WITH MOST DISAGREEMENT (one user tags, other doesn't)") + cursor.execute(""" + SELECT + t.name AS tag_name, + t.type AS tag_type, + u.name AS only_tagged_by, + COUNT(*) AS times_only_one_tagged + FROM arc_puzzles_tags apt + JOIN tags t ON apt.tags_id = t.id + JOIN users u ON apt.user_id = u.id + WHERE (apt.arc_puzzles_id, apt.tags_id) IN ( + SELECT arc_puzzles_id, tags_id + FROM arc_puzzles_tags + GROUP BY arc_puzzles_id, tags_id + HAVING COUNT(DISTINCT user_id) = 1 + ) + AND apt.arc_puzzles_id IN ( + SELECT arc_puzzles_id + FROM arc_puzzles_tags + GROUP BY arc_puzzles_id + HAVING COUNT(DISTINCT user_id) > 1 + ) + GROUP BY t.id, t.name, t.type, u.name + ORDER BY times_only_one_tagged DESC + LIMIT 20 + """) + rows = cursor.fetchall() + print_table(["Tag Name", "Type", "Only Tagged By", "Times"], rows, [35, 20, 15, 8]) + results['disagreement_tags'] = rows + + # 15. Tags Users Consistently Agree On + print_section("15. TAGS USERS CONSISTENTLY AGREE ON") + cursor.execute(""" + SELECT + t.name AS tag_name, + t.type AS tag_type, + COUNT(*) AS times_both_agreed + FROM ( + SELECT arc_puzzles_id, tags_id + FROM arc_puzzles_tags + GROUP BY arc_puzzles_id, tags_id + HAVING COUNT(DISTINCT user_id) = 2 + ) agreed + JOIN tags t ON agreed.tags_id = t.id + GROUP BY t.id, t.name, t.type + ORDER BY times_both_agreed DESC + """) + rows = cursor.fetchall() + print_table(["Tag Name", "Type", "Times Agreed"], rows, [40, 20, 12]) + results['agreement_tags'] = rows + + return results + +def main(): + print("ARC Puzzles Tags Analysis") + print("=" * 70) + + try: + config = load_env_config() + print(f"Connecting to {config['host']}:{config['port']}/{config['database']}...") + connection = pymysql.connect(**config) + print("Connected successfully!\n") + except Exception as e: + print(f"Error connecting to database: {e}") + return 1 + + try: + cursor = connection.cursor() + results = run_analysis(cursor) + + # Summary + print_section("SUMMARY") + print(f"Total tag assignments analyzed") + cursor.execute("SELECT COUNT(*) FROM arc_puzzles_tags") + print(f" - Total assignments: {cursor.fetchone()[0]}") + cursor.execute("SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags") + print(f" - Puzzles with tags: {cursor.fetchone()[0]}") + cursor.execute("SELECT COUNT(*) FROM tags") + print(f" - Total tags defined: {cursor.fetchone()[0]}") + print(f" - Unused tags: {len(results['unused'])}") + print(f" - Possible duplicate tags: {len(results['duplicates'])}") + + except Exception as e: + print(f"\nError during analysis: {e}") + import traceback + traceback.print_exc() + return 1 + finally: + connection.close() + print(f"\nDatabase connection closed.") + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/todo-2.md b/todo-2.md new file mode 100644 index 0000000..b7f409d --- /dev/null +++ b/todo-2.md @@ -0,0 +1,5 @@ +- [x] Editing of Existing Answers +- [x] Editing of Existing Skills +- [x] Add the problem viewer to the User Eval Viewer. +- Refresh the list in the user eval viewer when selected +- In the "Explorer" tab I'd like to see a way to link to the annotations for each retrieved example.