tag analysis
This commit is contained in:
410
scripts/analyze_tags.py
Executable file
410
scripts/analyze_tags.py
Executable file
@ -0,0 +1,410 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Analyze arc_puzzles_tags table for patterns, correlations, and duplicates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pymysql
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
def load_env_config():
    """Load database connection settings from the environment / .env file.

    Calls ``load_dotenv()`` and then reads DB_HOST, DB_USER, DB_PASSWORD,
    DB_NAME and DB_PORT with ``os.getenv``.

    Returns:
        dict with the keys 'host', 'user', 'password', 'database', 'port'
        and 'charset'; main() passes it as keyword arguments to
        ``pymysql.connect``. Values whose variable is unset are ``None``,
        except 'port', which falls back to 3306.

    Raises:
        ValueError: if DB_PORT is set to a non-empty, non-integer string.
    """
    load_dotenv()

    # Use `or` instead of a getenv default: a blank "DB_PORT=" entry yields
    # an empty string, which would bypass the default and make int() raise.
    port = int(os.getenv('DB_PORT') or 3306)

    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
|
||||||
|
|
||||||
|
def print_section(title):
    """Print a section banner: a blank line, a 70-character '=' rule,
    the title on its own line, and a closing '=' rule."""
    rule = '=' * 70
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)
|
||||||
|
|
||||||
|
def print_table(headers, rows, col_widths=None):
    """Print *rows* as a plain-text table with ' | ' between columns.

    Args:
        headers: sequence of column titles.
        rows: sequence of row sequences; cells are converted with str().
        col_widths: optional list of column widths. When omitted (or empty),
            each column is as wide as its longest header/cell text.

    Cells are left-justified to the column width; longer cells are not
    truncated. A dashed rule as long as the header line follows the header.
    """
    if not col_widths:
        col_widths = []
        for idx, header in enumerate(headers):
            # Widest cell in this column, or 0 when there are no rows at all.
            widest_cell = max(len(str(record[idx])) for record in rows) if rows else 0
            col_widths.append(max(len(str(header)), widest_cell))

    def render(cells):
        # zip() stops at the shorter of cells / col_widths.
        return " | ".join(str(cell).ljust(width) for cell, width in zip(cells, col_widths))

    heading = render(headers)
    print(heading)
    print("-" * len(heading))
    for record in rows:
        print(render(record))
|
||||||
|
|
||||||
|
def run_analysis(cursor):
    """Run every tag-analysis query on *cursor*, print each report, and
    return the fetched rows.

    Args:
        cursor: a database cursor exposing execute(), fetchall() and
            fetchone(). Rows are accessed by position (row[0], row[1], ...),
            so the cursor must return sequence rows, not dicts.

    Returns:
        dict mapping a report key to the rows fetched for that report. Keys:
        'most_common', 'correlations', 'duplicates', 'types', 'most_tagged',
        'least_tagged', 'duplicate_assignments', 'user_activity', 'unused',
        'agreement', 'high_agreement_puzzles', 'user_overlap' (a single row),
        'detailed_agreement', 'disagreement_tags', 'agreement_tags'.

    NOTE(review): the agreement reports treat "COUNT(DISTINCT user_id) = 2"
    as full agreement, which presumably assumes exactly two taggers (the
    overlap query names 'Bernardo' and 'Eric') - confirm if more users tag.
    """
    results = {}

    # 1. Most Common Tags
    print_section("1. MOST COMMON TAGS")
    cursor.execute("""
        SELECT
            t.name AS tag_name,
            t.type AS tag_type,
            COUNT(*) AS usage_count
        FROM arc_puzzles_tags apt
        JOIN tags t ON apt.tags_id = t.id
        GROUP BY t.id, t.name, t.type
        ORDER BY usage_count DESC
        LIMIT 20
    """)
    rows = cursor.fetchall()
    print_table(["Tag Name", "Type", "Usage"], rows, [40, 25, 10])
    results['most_common'] = rows

    # 2. Tag Correlations
    # The self-join uses tags_id < tags_id so each unordered pair is counted once.
    print_section("2. TAG CORRELATIONS (tags that frequently appear together)")
    cursor.execute("""
        SELECT
            t1.name AS tag1,
            t2.name AS tag2,
            COUNT(*) AS co_occurrence
        FROM arc_puzzles_tags apt1
        JOIN arc_puzzles_tags apt2 ON apt1.arc_puzzles_id = apt2.arc_puzzles_id AND apt1.tags_id < apt2.tags_id
        JOIN tags t1 ON apt1.tags_id = t1.id
        JOIN tags t2 ON apt2.tags_id = t2.id
        GROUP BY t1.id, t2.id, t1.name, t2.name
        HAVING co_occurrence >= 3
        ORDER BY co_occurrence DESC
        LIMIT 25
    """)
    rows = cursor.fetchall()
    print_table(["Tag 1", "Tag 2", "Co-occurrences"], rows, [35, 35, 15])
    results['correlations'] = rows

    # 3. Possible Duplicate Tags
    # A pair matches when the names are equal ignoring case and spaces, when
    # one name contains the other, or when their SOUNDEX codes are equal.
    print_section("3. POSSIBLE DUPLICATE TAGS (similar names)")
    cursor.execute("""
        SELECT
            t1.id AS id1,
            t1.name AS name1,
            t1.type AS type1,
            t2.id AS id2,
            t2.name AS name2,
            t2.type AS type2
        FROM tags t1
        JOIN tags t2 ON t1.id < t2.id
        WHERE
            LOWER(REPLACE(t1.name, ' ', '')) = LOWER(REPLACE(t2.name, ' ', ''))
            OR LOWER(t1.name) LIKE CONCAT('%', LOWER(t2.name), '%')
            OR LOWER(t2.name) LIKE CONCAT('%', LOWER(t1.name), '%')
            OR SOUNDEX(t1.name) = SOUNDEX(t2.name)
        ORDER BY t1.name
    """)
    rows = cursor.fetchall()
    print_table(["ID1", "Name 1", "Type 1", "ID2", "Name 2", "Type 2"], rows, [5, 30, 15, 5, 30, 15])
    results['duplicates'] = rows

    # 4. Tag Types Distribution
    # total_usage counts apt.tags_id rather than *: with the LEFT JOIN, a tag
    # that has no assignments still yields one (NULL-extended) row, and
    # COUNT(*) would report it as one usage.
    print_section("4. TAG TYPES DISTRIBUTION")
    cursor.execute("""
        SELECT
            t.type AS tag_type,
            COUNT(DISTINCT t.id) AS unique_tags,
            COUNT(apt.tags_id) AS total_usage
        FROM tags t
        LEFT JOIN arc_puzzles_tags apt ON t.id = apt.tags_id
        GROUP BY t.type
        ORDER BY total_usage DESC
    """)
    rows = cursor.fetchall()
    print_table(["Tag Type", "Unique Tags", "Total Usage"], rows, [25, 15, 15])
    results['types'] = rows

    # 5. Puzzles with Most Tags
    print_section("5. PUZZLES WITH MOST TAGS")
    cursor.execute("""
        SELECT
            apt.arc_puzzles_id,
            COUNT(*) AS tag_count,
            GROUP_CONCAT(t.name SEPARATOR ', ') AS tags
        FROM arc_puzzles_tags apt
        JOIN tags t ON apt.tags_id = t.id
        GROUP BY apt.arc_puzzles_id
        ORDER BY tag_count DESC
        LIMIT 10
    """)
    rows = cursor.fetchall()
    for row in rows:
        print(f"\n{row[0]} ({row[1]} tags):")
        # Long tag lists are cut to 100 characters for display only.
        print(f" {row[2][:100]}..." if len(row[2]) > 100 else f" {row[2]}")
    results['most_tagged'] = rows

    # 6. Puzzles with Fewest Tags
    print_section("6. PUZZLES WITH FEWEST TAGS")
    cursor.execute("""
        SELECT
            apt.arc_puzzles_id,
            COUNT(*) AS tag_count,
            GROUP_CONCAT(t.name SEPARATOR ', ') AS tags
        FROM arc_puzzles_tags apt
        JOIN tags t ON apt.tags_id = t.id
        GROUP BY apt.arc_puzzles_id
        ORDER BY tag_count ASC
        LIMIT 10
    """)
    rows = cursor.fetchall()
    for row in rows:
        print(f"{row[0]} ({row[1]} tags): {row[2]}")
    results['least_tagged'] = rows

    # 7. Duplicate Tag Assignments (same tag assigned multiple times to same puzzle)
    # The LEFT JOIN keeps assignments whose user_id has no matching users row.
    print_section("7. DUPLICATE TAG ASSIGNMENTS (same tag on same puzzle by different users)")
    cursor.execute("""
        SELECT
            apt.arc_puzzles_id,
            t.name AS tag_name,
            COUNT(*) AS times_assigned,
            GROUP_CONCAT(DISTINCT u.name SEPARATOR ', ') AS assigned_by_users
        FROM arc_puzzles_tags apt
        JOIN tags t ON apt.tags_id = t.id
        LEFT JOIN users u ON apt.user_id = u.id
        GROUP BY apt.arc_puzzles_id, apt.tags_id, t.name
        HAVING times_assigned > 1
        ORDER BY times_assigned DESC
        LIMIT 15
    """)
    rows = cursor.fetchall()
    print_table(["Puzzle ID", "Tag Name", "Times", "Assigned By"], rows, [15, 35, 8, 25])
    results['duplicate_assignments'] = rows

    # 8. User Tagging Activity
    print_section("8. USER TAGGING ACTIVITY")
    cursor.execute("""
        SELECT
            u.name AS user_name,
            COUNT(*) AS tags_assigned,
            COUNT(DISTINCT apt.arc_puzzles_id) AS puzzles_tagged,
            COUNT(DISTINCT apt.tags_id) AS unique_tags_used
        FROM arc_puzzles_tags apt
        JOIN users u ON apt.user_id = u.id
        GROUP BY u.id, u.name
        ORDER BY tags_assigned DESC
    """)
    rows = cursor.fetchall()
    print_table(["User", "Tags Assigned", "Puzzles Tagged", "Unique Tags Used"], rows, [20, 15, 15, 17])
    results['user_activity'] = rows

    # 9. Unused Tags
    # Anti-join: keep only tags for which the LEFT JOIN found no assignment row.
    print_section("9. UNUSED TAGS (defined but never assigned)")
    cursor.execute("""
        SELECT
            t.id,
            t.name,
            t.type,
            COALESCE(t.description, '') as description
        FROM tags t
        LEFT JOIN arc_puzzles_tags apt ON t.id = apt.tags_id
        WHERE apt.id IS NULL
        ORDER BY t.type, t.name
    """)
    rows = cursor.fetchall()
    print_table(["ID", "Name", "Type", "Description"], rows, [5, 25, 15, 40])
    results['unused'] = rows

    # 10. Tag Agreement Rate
    # The subquery yields one row per (puzzle, tag) with the number of
    # distinct users who assigned it; only tags with >= 5 such rows are shown.
    print_section("10. TAG AGREEMENT RATE (when both users tagged same puzzle)")
    cursor.execute("""
        SELECT
            t.name AS tag_name,
            SUM(CASE WHEN user_count = 2 THEN 1 ELSE 0 END) AS both_users_agreed,
            SUM(CASE WHEN user_count = 1 THEN 1 ELSE 0 END) AS only_one_user,
            ROUND(SUM(CASE WHEN user_count = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS agreement_pct
        FROM (
            SELECT apt.arc_puzzles_id, apt.tags_id, COUNT(DISTINCT apt.user_id) AS user_count
            FROM arc_puzzles_tags apt
            GROUP BY apt.arc_puzzles_id, apt.tags_id
        ) sub
        JOIN tags t ON sub.tags_id = t.id
        GROUP BY t.id, t.name
        HAVING COUNT(*) >= 5
        ORDER BY agreement_pct DESC
        LIMIT 20
    """)
    rows = cursor.fetchall()
    print_table(["Tag Name", "Both Agreed", "Only One", "Agreement %"], rows, [40, 12, 10, 12])
    results['agreement'] = rows

    # 11. High Agreement Puzzles (puzzles where users agreed on tags)
    # Restricted to puzzles that more than one distinct user has tagged.
    print_section("11. PUZZLES WITH HIGH TAG AGREEMENT")
    cursor.execute("""
        SELECT
            sub.arc_puzzles_id,
            SUM(CASE WHEN sub.user_count = 2 THEN 1 ELSE 0 END) AS agreed_tags,
            SUM(CASE WHEN sub.user_count = 1 THEN 1 ELSE 0 END) AS single_user_tags,
            COUNT(*) AS total_unique_tags,
            ROUND(SUM(CASE WHEN sub.user_count = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS agreement_pct
        FROM (
            SELECT arc_puzzles_id, tags_id, COUNT(DISTINCT user_id) AS user_count
            FROM arc_puzzles_tags
            GROUP BY arc_puzzles_id, tags_id
        ) sub
        WHERE sub.arc_puzzles_id IN (
            SELECT arc_puzzles_id
            FROM arc_puzzles_tags
            GROUP BY arc_puzzles_id
            HAVING COUNT(DISTINCT user_id) > 1
        )
        GROUP BY sub.arc_puzzles_id
        ORDER BY agreement_pct DESC, agreed_tags DESC
    """)
    rows = cursor.fetchall()
    print_table(["Puzzle ID", "Agreed", "Single User", "Total Tags", "Agreement %"], rows, [15, 8, 12, 12, 12])
    results['high_agreement_puzzles'] = rows

    # 12. User Overlap Analysis
    # both_tagged counts puzzles tagged by more than one distinct user of any
    # name; it is not filtered to the two users named in the other columns.
    print_section("12. USER OVERLAP ANALYSIS")
    cursor.execute("""
        SELECT
            (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags apt
             JOIN users u ON apt.user_id = u.id WHERE u.name = 'Bernardo') AS bernardo_total,
            (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags apt
             JOIN users u ON apt.user_id = u.id WHERE u.name = 'Eric') AS eric_total,
            (SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags
             WHERE arc_puzzles_id IN (
                 SELECT arc_puzzles_id FROM arc_puzzles_tags
                 GROUP BY arc_puzzles_id HAVING COUNT(DISTINCT user_id) > 1
             )) AS both_tagged
    """)
    row = cursor.fetchone()
    print(f" Bernardo tagged: {row[0]} puzzles")
    print(f" Eric tagged: {row[1]} puzzles")
    print(f" Both tagged (overlap): {row[2]} puzzles")
    # Guard the ratio: if either named user has tagged nothing, the smaller
    # total is 0 and the division would raise ZeroDivisionError.
    smaller_total = min(row[0], row[1])
    if smaller_total:
        print(f" Overlap rate: {round(row[2] * 100.0 / smaller_total, 1)}%")
    else:
        print(" Overlap rate: n/a (at least one user has no tagged puzzles)")
    results['user_overlap'] = row

    # 13. Detailed Agreement for Overlapping Puzzles
    print_section("13. DETAILED TAG AGREEMENT FOR OVERLAPPING PUZZLES")
    cursor.execute("""
        SELECT
            apt.arc_puzzles_id,
            t.name AS tag_name,
            t.type AS tag_type,
            GROUP_CONCAT(DISTINCT u.name ORDER BY u.name SEPARATOR ', ') AS tagged_by,
            COUNT(DISTINCT apt.user_id) AS user_count,
            CASE WHEN COUNT(DISTINCT apt.user_id) = 2 THEN 'AGREED' ELSE 'SINGLE' END AS status
        FROM arc_puzzles_tags apt
        JOIN tags t ON apt.tags_id = t.id
        JOIN users u ON apt.user_id = u.id
        WHERE apt.arc_puzzles_id IN (
            SELECT arc_puzzles_id
            FROM arc_puzzles_tags
            GROUP BY arc_puzzles_id
            HAVING COUNT(DISTINCT user_id) > 1
        )
        GROUP BY apt.arc_puzzles_id, apt.tags_id, t.name, t.type
        ORDER BY apt.arc_puzzles_id, user_count DESC, t.name
    """)
    rows = cursor.fetchall()

    # Rows arrive ordered by puzzle id, so a change in row[0] marks the start
    # of the next puzzle's group and triggers a new sub-heading.
    current_puzzle = None
    for row in rows:
        if row[0] != current_puzzle:
            current_puzzle = row[0]
            print(f"\n Puzzle: {current_puzzle}")
            print(f" {'-' * 60}")
        status_marker = "+" if row[5] == 'AGREED' else " "
        print(f" {status_marker} {row[1]:<35} ({row[2]:<20}) - {row[3]}")
    results['detailed_agreement'] = rows

    # 14. Tags Users Tend to Disagree On
    # (puzzle, tag) pairs assigned by exactly one user, limited to puzzles
    # that more than one user has tagged.
    print_section("14. TAGS WITH MOST DISAGREEMENT (one user tags, other doesn't)")
    cursor.execute("""
        SELECT
            t.name AS tag_name,
            t.type AS tag_type,
            u.name AS only_tagged_by,
            COUNT(*) AS times_only_one_tagged
        FROM arc_puzzles_tags apt
        JOIN tags t ON apt.tags_id = t.id
        JOIN users u ON apt.user_id = u.id
        WHERE (apt.arc_puzzles_id, apt.tags_id) IN (
            SELECT arc_puzzles_id, tags_id
            FROM arc_puzzles_tags
            GROUP BY arc_puzzles_id, tags_id
            HAVING COUNT(DISTINCT user_id) = 1
        )
        AND apt.arc_puzzles_id IN (
            SELECT arc_puzzles_id
            FROM arc_puzzles_tags
            GROUP BY arc_puzzles_id
            HAVING COUNT(DISTINCT user_id) > 1
        )
        GROUP BY t.id, t.name, t.type, u.name
        ORDER BY times_only_one_tagged DESC
        LIMIT 20
    """)
    rows = cursor.fetchall()
    print_table(["Tag Name", "Type", "Only Tagged By", "Times"], rows, [35, 20, 15, 8])
    results['disagreement_tags'] = rows

    # 15. Tags Users Consistently Agree On
    print_section("15. TAGS USERS CONSISTENTLY AGREE ON")
    cursor.execute("""
        SELECT
            t.name AS tag_name,
            t.type AS tag_type,
            COUNT(*) AS times_both_agreed
        FROM (
            SELECT arc_puzzles_id, tags_id
            FROM arc_puzzles_tags
            GROUP BY arc_puzzles_id, tags_id
            HAVING COUNT(DISTINCT user_id) = 2
        ) agreed
        JOIN tags t ON agreed.tags_id = t.id
        GROUP BY t.id, t.name, t.type
        ORDER BY times_both_agreed DESC
    """)
    rows = cursor.fetchall()
    print_table(["Tag Name", "Type", "Times Agreed"], rows, [40, 20, 12])
    results['agreement_tags'] = rows

    return results
|
||||||
|
|
||||||
|
def main():
    """Connect to the database, run the full tag analysis, and print a summary.

    Returns:
        int: 0 on success; 1 if connecting fails or any query/report raises.
        The value is used as the process exit status by the script entry
        point.
    """
    print("ARC Puzzles Tags Analysis")
    print("=" * 70)

    try:
        config = load_env_config()
        print(f"Connecting to {config['host']}:{config['port']}/{config['database']}...")
        connection = pymysql.connect(**config)
        print("Connected successfully!\n")
    except Exception as e:
        # Top-level boundary: report the failure and signal it via the exit code.
        print(f"Error connecting to database: {e}")
        return 1

    try:
        # The context manager closes the cursor even if a query raises.
        with connection.cursor() as cursor:
            results = run_analysis(cursor)

            # Summary
            print_section("SUMMARY")
            print("Total tag assignments analyzed")
            cursor.execute("SELECT COUNT(*) FROM arc_puzzles_tags")
            print(f" - Total assignments: {cursor.fetchone()[0]}")
            cursor.execute("SELECT COUNT(DISTINCT arc_puzzles_id) FROM arc_puzzles_tags")
            print(f" - Puzzles with tags: {cursor.fetchone()[0]}")
            cursor.execute("SELECT COUNT(*) FROM tags")
            print(f" - Total tags defined: {cursor.fetchone()[0]}")
            print(f" - Unused tags: {len(results['unused'])}")
            print(f" - Possible duplicate tags: {len(results['duplicates'])}")

    except Exception as e:
        print(f"\nError during analysis: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        # Runs on both the success and the failure path.
        connection.close()
        print("\nDatabase connection closed.")

    return 0
|
||||||
|
|
||||||
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    exit_code = main()
    sys.exit(exit_code)
|
||||||
5
todo-2.md
Normal file
5
todo-2.md
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
- [x] Editing of Existing Answers
|
||||||
|
- [x] Editing of Existing Skills
|
||||||
|
- [x] Add the problem viewer to the User Eval Viewer.
|
||||||
|
- [ ] Refresh the list in the user eval viewer when selected
|
||||||
|
- [ ] In the "Explorer" tab I'd like to see a way to link to the annotations for each retrieved example.
|
||||||
Reference in New Issue
Block a user