tag analysis

This commit is contained in:
bmachado
2025-12-11 16:52:05 +00:00
parent 5eafafa47b
commit 7a4444c930
2 changed files with 415 additions and 0 deletions

410
scripts/analyze_tags.py Executable file
View File

@ -0,0 +1,410 @@
#!/usr/bin/env python3
"""
Analyze arc_puzzles_tags table for patterns, correlations, and duplicates.
"""
import os
import sys
import pymysql
from dotenv import load_dotenv
from collections import defaultdict
def load_env_config():
"""Load database configuration from .env file"""
load_dotenv()
return {
'host': os.getenv('DB_HOST'),
'user': os.getenv('DB_USER'),
'password': os.getenv('DB_PASSWORD'),
'database': os.getenv('DB_NAME'),
'port': int(os.getenv('DB_PORT', 3306)),
'charset': 'utf8mb4'
}
def print_section(title):
print(f"\n{'=' * 70}")
print(f" {title}")
print('=' * 70)
def print_table(headers, rows, col_widths=None):
if not col_widths:
col_widths = [max(len(str(h)), max(len(str(r[i])) for r in rows) if rows else 0)
for i, h in enumerate(headers)]
header_line = " | ".join(str(h).ljust(w) for h, w in zip(headers, col_widths))
print(header_line)
print("-" * len(header_line))
for row in rows:
print(" | ".join(str(v).ljust(w) for v, w in zip(row, col_widths)))
def run_analysis(cursor):
results = {}
# 1. Most Common Tags
print_section("1. MOST COMMON TAGS")
cursor.execute("""
SELECT
t.name AS tag_name,
t.type AS tag_type,
COUNT(*) AS usage_count
FROM arc_puzzles_tags apt
JOIN tags t ON apt.tags_id = t.id
GROUP BY t.id, t.name, t.type
ORDER BY usage_count DESC
LIMIT 20
""")
rows = cursor.fetchall()
print_table(["Tag Name", "Type", "Usage"], rows, [40, 25, 10])
results['most_common'] = rows
# 2. Tag Correlations
print_section("2. TAG CORRELATIONS (tags that frequently appear together)")
cursor.execute("""
SELECT
t1.name AS tag1,
t2.name AS tag2,
COUNT(*) AS co_occurrence
FROM arc_puzzles_tags apt1
JOIN arc_puzzles_tags apt2 ON apt1.arc_puzzles_id = apt2.arc_puzzles_id AND apt1.tags_id < apt2.tags_id
JOIN tags t1 ON apt1.tags_id = t1.id
JOIN tags t2 ON apt2.tags_id = t2.id
GROUP BY t1.id, t2.id, t1.name, t2.name
HAVING co_occurrence >= 3
ORDER BY co_occurrence DESC
LIMIT 25
""")
rows = cursor.fetchall()
print_table(["Tag 1", "Tag 2", "Co-occurrences"], rows, [35, 35, 15])
results['correlations'] = rows
# 3. Possible Duplicate Tags
print_section("3. POSSIBLE DUPLICATE TAGS (similar names)")
cursor.execute("""
SELECT
t1.id AS id1,
t1.name AS name1,
t1.type AS type1,
t2.id AS id2,
t2.name AS name2,
t2.type AS type2
FROM tags t1
JOIN tags t2 ON t1.id < t2.id
WHERE
LOWER(REPLACE(t1.name, ' ', '')) = LOWER(REPLACE(t2.name, ' ', ''))
OR LOWER(t1.name) LIKE CONCAT('%', LOWER(t2.name), '%')
OR LOWER(t2.name) LIKE CONCAT('%', LOWER(t1.name), '%')
OR SOUNDEX(t1.name) = SOUNDEX(t2.name)
ORDER BY t1.name
""")
rows = cursor.fetchall()
print_table(["ID1", "Name 1", "Type 1", "ID2", "Name 2", "Type 2"], rows, [5, 30, 15, 5, 30, 15])
results['duplicates'] = rows
# 4. Tag Types Distribution
print_section("4. TAG TYPES DISTRIBUTION")
cursor.execute("""
SELECT
t.type AS tag_type,
COUNT(DISTINCT t.id) AS unique_tags,
COUNT(*) AS total_usage
FROM tags t
LEFT JOIN arc_puzzles_tags apt ON t.id = apt.tags_id
GROUP BY t.type
ORDER BY total_usage DESC
""")
rows = cursor.fetchall()
print_table(["Tag Type", "Unique Tags", "Total Usage"], rows, [25, 15, 15])
results['types'] = rows
# 5. Puzzles with Most Tags
print_section("5. PUZZLES WITH MOST TAGS")
cursor.execute("""
SELECT
apt.arc_puzzles_id,
COUNT(*) AS tag_count,
GROUP_CONCAT(t.name SEPARATOR ', ') AS tags
FROM arc_puzzles_tags apt
JOIN tags t ON apt.tags_id = t.id
GROUP BY apt.arc_puzzles_id
ORDER BY tag_count DESC
LIMIT 10
""")
rows = cursor.fetchall()
for row in rows:
print(f"\n{row[0]} ({row[1]} tags):")
print(f" {row[2][:100]}..." if len(row[2]) > 100 else f" {row[2]}")
results['most_tagged'] = rows
# 6. Puzzles with Fewest Tags
print_section("6. PUZZLES WITH FEWEST TAGS")
cursor.execute("""
SELECT
apt.arc_puzzles_id,
COUNT(*) AS tag_count,
GROUP_CONCAT(t.name SEPARATOR ', ') AS tags
FROM arc_puzzles_tags apt
JOIN tags t ON apt.tags_id = t.id
GROUP BY apt.arc_puzzles_id
ORDER BY tag_count ASC
LIMIT 10
""")
rows = cursor.fetchall()
for row in rows:
print(f"{row[0]} ({row[1]} tags): {row[2]}")
results['least_tagged'] = rows
# 7. Duplicate Tag Assignments (same tag assigned multiple times to same puzzle)
print_section("7. DUPLICATE TAG ASSIGNMENTS (same tag on same puzzle by different users)")
cursor.execute("""
SELECT
apt.arc_puzzles_id,
t.name AS tag_name,
COUNT(*) AS times_assigned,
GROUP_CONCAT(DISTINCT u.name SEPARATOR ', ') AS assigned_by_users
FROM arc_puzzles_tags apt
JOIN tags t ON apt.tags_id = t.id
LEFT JOIN users u ON apt.user_id = u.id
GROUP BY apt.arc_puzzles_id, apt.tags_id, t.name
HAVING times_assigned > 1
ORDER BY times_assigned DESC
LIMIT 15
""")
rows = cursor.fetchall()
print_table(["Puzzle ID", "Tag Name", "Times", "Assigned By"], rows, [15, 35, 8, 25])
results['duplicate_assignments'] = rows
# 8. User Tagging Activity
print_section("8. USER TAGGING ACTIVITY")
cursor.execute("""
SELECT
u.name AS user_name,
COUNT(*) AS tags_assigned,
COUNT(DISTINCT apt.arc_puzzles_id) AS puzzles_tagged,
COUNT(DISTINCT apt.tags_id) AS unique_tags_used
FROM arc_puzzles_tags apt
JOIN users u ON apt.user_id = u.id
GROUP BY u.id, u.name
ORDER BY tags_assigned DESC
""")
rows = cursor.fetchall()
print_table(["User", "Tags Assigned", "Puzzles Tagged", "Unique Tags Used"], rows, [20, 15, 15, 17])
results['user_activity'] = rows
# 9. Unused Tags
print_section("9. UNUSED TAGS (defined but never assigned)")
cursor.execute("""
SELECT
t.id,
t.name,
t.type,
COALESCE(t.description, '') as description
FROM tags t
LEFT JOIN arc_puzzles_tags apt ON t.id = apt.tags_id
WHERE apt.id IS NULL
ORDER BY t.type, t.name
""")
rows = cursor.fetchall()
print_table(["ID", "Name", "Type", "Description"], rows, [5, 25, 15, 40])
results['unused'] = rows
# 10. Tag Agreement Rate
print_section("10. TAG AGREEMENT RATE (when both users tagged same puzzle)")
cursor.execute("""
SELECT
t.name AS tag_name,
SUM(CASE WHEN user_count = 2 THEN 1 ELSE 0 END) AS both_users_agreed,
SUM(CASE WHEN user_count = 1 THEN 1 ELSE 0 END) AS only_one_user,
ROUND(SUM(CASE WHEN user_count = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS agreement_pct
FROM (
SELECT apt.arc_puzzles_id, apt.tags_id, COUNT(DISTINCT apt.user_id) AS user_count
FROM arc_puzzles_tags apt
GROUP BY apt.arc_puzzles_id, apt.tags_id
) sub
JOIN tags t ON sub.tags_id = t.id
GROUP BY t.id, t.name
HAVING COUNT(*) >= 5
ORDER BY agreement_pct DESC
LIMIT 20
""")
rows = cursor.fetchall()
print_table(["Tag Name", "Both Agreed", "Only One", "Agreement %"], rows, [40, 12, 10, 12])
results['agreement'] = rows
# 11. High Agreement Puzzles (puzzles where users agreed on tags)
print_section("11. PUZZLES WITH HIGH TAG AGREEMENT")
cursor.execute("""
SELECT
sub.arc_puzzles_id,
SUM(CASE WHEN sub.user_count = 2 THEN 1 ELSE 0 END) AS agreed_tags,
SUM(CASE WHEN sub.user_count = 1 THEN 1 ELSE 0 END) AS single_user_tags,
COUNT(*) AS total_unique_tags,
ROUND(SUM(CASE WHEN sub.user_count = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS agreement_pct
FROM (
SELECT arc_puzzles_id, tags_id, COUNT(DISTINCT user_id) AS user_count
FROM arc_puzzles_tags
GROUP BY arc_puzzles_id, tags_id
) sub
WHERE sub.arc_puzzles_id IN (
SELECT arc_puzzles_id
FROM arc_puzzles_tags
GROUP BY arc_puzzles_id
HAVING COUNT(DISTINCT user_id) > 1
)
GROUP BY sub.arc_puzzles_id
ORDER BY agreement_pct DESC, agreed_tags DESC
""")
rows = cursor.fetchall()
print_table(["Puzzle ID", "Agreed", "Single User", "Total Tags", "Agreement %"], rows, [15, 8, 12, 12, 12])
results['high_agreement_puzzles'] = rows
# 12. User Overlap Analysis
print_section("12. USER OVERLAP ANALYSIS")
cursor.execute("""
SELECT
(SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags apt
JOIN users u ON apt.user_id = u.id WHERE u.name = 'Bernardo') AS bernardo_total,
(SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags apt
JOIN users u ON apt.user_id = u.id WHERE u.name = 'Eric') AS eric_total,
(SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags
WHERE arc_puzzles_id IN (
SELECT arc_puzzles_id FROM arc_puzzles_tags
GROUP BY arc_puzzles_id HAVING COUNT(DISTINCT user_id) > 1
)) AS both_tagged
""")
row = cursor.fetchone()
print(f" Bernardo tagged: {row[0]} puzzles")
print(f" Eric tagged: {row[1]} puzzles")
print(f" Both tagged (overlap): {row[2]} puzzles")
print(f" Overlap rate: {round(row[2] * 100.0 / min(row[0], row[1]), 1)}%")
results['user_overlap'] = row
# 13. Detailed Agreement for Overlapping Puzzles
print_section("13. DETAILED TAG AGREEMENT FOR OVERLAPPING PUZZLES")
cursor.execute("""
SELECT
apt.arc_puzzles_id,
t.name AS tag_name,
t.type AS tag_type,
GROUP_CONCAT(DISTINCT u.name ORDER BY u.name SEPARATOR ', ') AS tagged_by,
COUNT(DISTINCT apt.user_id) AS user_count,
CASE WHEN COUNT(DISTINCT apt.user_id) = 2 THEN 'AGREED' ELSE 'SINGLE' END AS status
FROM arc_puzzles_tags apt
JOIN tags t ON apt.tags_id = t.id
JOIN users u ON apt.user_id = u.id
WHERE apt.arc_puzzles_id IN (
SELECT arc_puzzles_id
FROM arc_puzzles_tags
GROUP BY arc_puzzles_id
HAVING COUNT(DISTINCT user_id) > 1
)
GROUP BY apt.arc_puzzles_id, apt.tags_id, t.name, t.type
ORDER BY apt.arc_puzzles_id, user_count DESC, t.name
""")
rows = cursor.fetchall()
current_puzzle = None
for row in rows:
if row[0] != current_puzzle:
current_puzzle = row[0]
print(f"\n Puzzle: {current_puzzle}")
print(f" {'-' * 60}")
status_marker = "+" if row[5] == 'AGREED' else " "
print(f" {status_marker} {row[1]:<35} ({row[2]:<20}) - {row[3]}")
results['detailed_agreement'] = rows
# 14. Tags Users Tend to Disagree On
print_section("14. TAGS WITH MOST DISAGREEMENT (one user tags, other doesn't)")
cursor.execute("""
SELECT
t.name AS tag_name,
t.type AS tag_type,
u.name AS only_tagged_by,
COUNT(*) AS times_only_one_tagged
FROM arc_puzzles_tags apt
JOIN tags t ON apt.tags_id = t.id
JOIN users u ON apt.user_id = u.id
WHERE (apt.arc_puzzles_id, apt.tags_id) IN (
SELECT arc_puzzles_id, tags_id
FROM arc_puzzles_tags
GROUP BY arc_puzzles_id, tags_id
HAVING COUNT(DISTINCT user_id) = 1
)
AND apt.arc_puzzles_id IN (
SELECT arc_puzzles_id
FROM arc_puzzles_tags
GROUP BY arc_puzzles_id
HAVING COUNT(DISTINCT user_id) > 1
)
GROUP BY t.id, t.name, t.type, u.name
ORDER BY times_only_one_tagged DESC
LIMIT 20
""")
rows = cursor.fetchall()
print_table(["Tag Name", "Type", "Only Tagged By", "Times"], rows, [35, 20, 15, 8])
results['disagreement_tags'] = rows
# 15. Tags Users Consistently Agree On
print_section("15. TAGS USERS CONSISTENTLY AGREE ON")
cursor.execute("""
SELECT
t.name AS tag_name,
t.type AS tag_type,
COUNT(*) AS times_both_agreed
FROM (
SELECT arc_puzzles_id, tags_id
FROM arc_puzzles_tags
GROUP BY arc_puzzles_id, tags_id
HAVING COUNT(DISTINCT user_id) = 2
) agreed
JOIN tags t ON agreed.tags_id = t.id
GROUP BY t.id, t.name, t.type
ORDER BY times_both_agreed DESC
""")
rows = cursor.fetchall()
print_table(["Tag Name", "Type", "Times Agreed"], rows, [40, 20, 12])
results['agreement_tags'] = rows
return results
def main():
print("ARC Puzzles Tags Analysis")
print("=" * 70)
try:
config = load_env_config()
print(f"Connecting to {config['host']}:{config['port']}/{config['database']}...")
connection = pymysql.connect(**config)
print("Connected successfully!\n")
except Exception as e:
print(f"Error connecting to database: {e}")
return 1
try:
cursor = connection.cursor()
results = run_analysis(cursor)
# Summary
print_section("SUMMARY")
print(f"Total tag assignments analyzed")
cursor.execute("SELECT COUNT(*) FROM arc_puzzles_tags")
print(f" - Total assignments: {cursor.fetchone()[0]}")
cursor.execute("SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags")
print(f" - Puzzles with tags: {cursor.fetchone()[0]}")
cursor.execute("SELECT COUNT(*) FROM tags")
print(f" - Total tags defined: {cursor.fetchone()[0]}")
print(f" - Unused tags: {len(results['unused'])}")
print(f" - Possible duplicate tags: {len(results['duplicates'])}")
except Exception as e:
print(f"\nError during analysis: {e}")
import traceback
traceback.print_exc()
return 1
finally:
connection.close()
print(f"\nDatabase connection closed.")
return 0
if __name__ == '__main__':
sys.exit(main())

5
todo-2.md Normal file
View File

@ -0,0 +1,5 @@
- [x] Editing of Existing Answers
- [x] Editing of Existing Skills
- [x] Add the problem viewer to the User Eval Viewer.
- Refresh the list in the user eval viewer when selected
- In the "Explorer" tab I'd like to see a way to link to the annotations for each retrieved example.