#!/usr/bin/env python3 """ Remove duplicate entries from arc_puzzles table Keeps only the first occurrence of each unique (id, corpora, set) combination """ import os import sys import pymysql from dotenv import load_dotenv def load_env_config(): """Load database configuration from .env file""" load_dotenv() return { 'host': os.getenv('DB_HOST'), 'user': os.getenv('DB_USER'), 'password': os.getenv('DB_PASSWORD'), 'database': os.getenv('DB_NAME'), 'port': int(os.getenv('DB_PORT', 3306)), 'charset': 'utf8mb4' } def main(): print("Removing Duplicates from arc_puzzles Table") print("=" * 50) config = load_env_config() connection = pymysql.connect(**config) cursor = connection.cursor() try: # First, check if arc_puzzles has a primary key or unique identifier cursor.execute("DESCRIBE arc_puzzles") columns = cursor.fetchall() print("\nTable structure:") for col in columns: print(f" {col[0]:<20} {col[1]:<20} Key:{col[3]}") # Find duplicates cursor.execute(""" SELECT id, corpora, `set`, COUNT(*) as count FROM arc_puzzles GROUP BY id, corpora, `set` HAVING COUNT(*) > 1 ORDER BY id """) duplicates = cursor.fetchall() if not duplicates: print("\n✓ No duplicates found!") return 0 print(f"\nFound {len(duplicates)} sets of duplicates") total_to_remove = sum(count - 1 for _, _, _, count in duplicates) print(f"Total rows to remove: {total_to_remove}") # Ask for confirmation if '--yes' not in sys.argv: print(f"\n⚠ This will delete {total_to_remove} duplicate rows") response = input("Continue? (yes/no): ").strip().lower() if response not in ['yes', 'y']: print("Operation cancelled") return 0 else: print(f"\n⚠ Deleting {total_to_remove} duplicate rows (auto-confirmed with --yes flag)") # For each duplicate set, keep only one and delete the rest # Since there's no auto-increment primary key, we'll use a different approach # We'll create a temporary table with unique entries, then replace the original print("\nRemoving duplicates...") removed_count = 0 for puzzle_id, corpora, set_name, count in duplicates: if count > 1: # Keep one, delete the extras # We delete (count - 1) duplicates delete_count = count - 1 # Delete using LIMIT to remove only the extra copies cursor.execute(""" DELETE FROM arc_puzzles WHERE id = %s AND corpora = %s AND `set` = %s LIMIT %s """, (puzzle_id, corpora, set_name, delete_count)) removed_count += cursor.rowcount if removed_count % 50 == 0: print(f" Removed {removed_count}/{total_to_remove} duplicates...") # Commit the changes connection.commit() print(f"\n{'=' * 50}") print(f"✓ Duplicate removal complete!") print(f" Total duplicates removed: {removed_count}") # Verify no duplicates remain cursor.execute(""" SELECT COUNT(*) FROM ( SELECT id, corpora, `set`, COUNT(*) as count FROM arc_puzzles GROUP BY id, corpora, `set` HAVING COUNT(*) > 1 ) as dups """) remaining_dups = cursor.fetchone()[0] if remaining_dups > 0: print(f"\n⚠ Warning: {remaining_dups} duplicate sets still remain") else: print(f"\n✓ No duplicates remaining!") # Show final counts cursor.execute("SELECT COUNT(*) FROM arc_puzzles") final_count = cursor.fetchone()[0] print(f"\nFinal table size: {final_count} rows") cursor.execute(""" SELECT corpora, COUNT(*) as count FROM arc_puzzles GROUP BY corpora ORDER BY corpora """) print("\nBy corpora:") for corpora, count in cursor.fetchall(): print(f" {corpora}: {count}") except Exception as e: connection.rollback() print(f"\n✗ Error: {e}") import traceback traceback.print_exc() return 1 finally: connection.close() print(f"\n✓ Database connection closed") return 0 if __name__ == '__main__': sys.exit(main())