#!/usr/bin/env python3
"""
Find duplicate puzzle IDs in arc_puzzles table.

Connects to the MySQL database described by the DB_* variables in the
environment / .env file, prints every (id, corpora, set) combination that
occurs more than once in arc_puzzles, and finishes with row counts for the
whole table and per corpora.
"""

import os

import pymysql
from dotenv import load_dotenv


def load_env_config():
    """Load database configuration from .env file.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password`` and ``database`` read from DB_HOST, DB_USER, DB_PASSWORD
        and DB_NAME; ``port`` read from DB_PORT (3306 when unset or empty);
        ``charset`` fixed to ``utf8mb4``.

    Raises:
        ValueError: if DB_PORT is set to a non-empty, non-integer value.
    """
    load_dotenv()
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        # `or` (rather than a getenv default) also covers DB_PORT being set
        # but empty, which int() would otherwise reject.
        'port': int(os.getenv('DB_PORT') or 3306),
        'charset': 'utf8mb4',
    }


def _surplus_rows(groups):
    """Return how many rows exceed one-per-group across (id, corpora, set, count) rows."""
    return sum(count - 1 for _, _, _, count in groups)


def _report_duplicates(cursor):
    """Query and print every (id, corpora, set) group that has more than one row."""
    cursor.execute("""
        SELECT id, corpora, `set`, COUNT(*) as count
        FROM arc_puzzles
        GROUP BY id, corpora, `set`
        HAVING COUNT(*) > 1
        ORDER BY count DESC, id
    """)
    duplicates = cursor.fetchall()

    if not duplicates:
        print("\n✓ No duplicates found!")
        return

    print(f"\n⚠ Found {len(duplicates)} duplicate entries:")
    print(f"{'ID':<20} {'Corpora':<15} {'Set':<20} {'Count':<10}")
    print("-" * 70)
    for puzzle_id, corpora, set_name, count in duplicates:
        # `!s` converts before padding: a NULL column arrives as None, and
        # None rejects a width spec with TypeError. Output for str/int values
        # is unchanged.
        print(f"{puzzle_id!s:<20} {corpora!s:<15} {set_name!s:<20} {count!s:<10}")

    # Each group keeps one row; everything beyond that is a removable duplicate.
    total_duplicates = _surplus_rows(duplicates)
    print(f"\nTotal duplicate rows to remove: {total_duplicates}")

    # Check ConceptArc specifically
    conceptarc_dups = [row for row in duplicates if row[1] == 'ConceptArc']
    if conceptarc_dups:
        print(f"\nConceptArc duplicates: {len(conceptarc_dups)} unique IDs")
        conceptarc_dup_count = _surplus_rows(conceptarc_dups)
        print(f"ConceptArc duplicate rows to remove: {conceptarc_dup_count}")


def _report_table_statistics(cursor):
    """Print the total row count of arc_puzzles and the row count per corpora."""
    print("\n" + "=" * 50)
    print("Current table statistics:")
    print("=" * 50)

    cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
    total = cursor.fetchone()[0]
    print(f"Total rows in arc_puzzles: {total}")

    cursor.execute("""
        SELECT corpora, COUNT(*) as count
        FROM arc_puzzles
        GROUP BY corpora
        ORDER BY corpora
    """)
    print("\nBy corpora:")
    for corpora, count in cursor.fetchall():
        print(f" {corpora}: {count}")


def main():
    """Print the duplicate report and table statistics for arc_puzzles.

    The connection is closed even when a query fails, so an error does not
    leave a database session open.
    """
    print("Finding Duplicate IDs in arc_puzzles")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            _report_duplicates(cursor)
            _report_table_statistics(cursor)
    finally:
        # Explicit close instead of `with connection:` -- the meaning of the
        # connection context manager differs between PyMySQL releases.
        connection.close()


if __name__ == '__main__':
    main()