Files
arc-humans-interface-db/scripts/find_duplicates.py
2025-11-05 13:28:23 +00:00

83 lines
2.4 KiB
Python

#!/usr/bin/env python3
"""
Find duplicate puzzle rows in the arc_puzzles table (rows sharing the same
id, corpora and set) and print overall row counts per corpora.
"""
import os
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Load database connection settings from the environment.

    ``load_dotenv()`` is called first, then each setting is read with
    ``os.getenv``.

    Returns:
        dict: ``host``, ``user``, ``password`` and ``database`` taken from
        ``DB_HOST``, ``DB_USER``, ``DB_PASSWORD`` and ``DB_NAME`` (each is
        ``None`` when the variable is unset), ``port`` as an ``int``
        (3306 when ``DB_PORT`` is unset or blank), and ``charset`` fixed to
        ``'utf8mb4'``.

    Raises:
        ValueError: If ``DB_PORT`` is non-blank but not a valid integer.
    """
    load_dotenv()

    # os.getenv only falls back to its default when the variable is unset;
    # a variable that is set but empty comes back as '' and int('') raises.
    # Treat a blank value the same as an unset one.
    raw_port = (os.getenv('DB_PORT') or '').strip()
    port = int(raw_port) if raw_port else 3306

    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def _display(value):
    """Return a printable form of a column value, showing None as 'NULL'."""
    # None does not accept a width/alignment format spec, so it has to be
    # turned into text before it is padded in the report.
    return 'NULL' if value is None else str(value)


def _extra_rows(groups):
    """Sum ``count - 1`` over ``(id, corpora, set, count)`` group rows."""
    return sum(count - 1 for _, _, _, count in groups)


def main():
    """Print a duplicate report and row statistics for ``arc_puzzles``.

    Connects with the settings returned by ``load_env_config()``, lists every
    ``(id, corpora, set)`` combination that occurs more than once, prints how
    many surplus rows that represents (overall and for the ``ConceptArc``
    corpora), then prints the total row count and the row count per corpora.
    The script only reads from the table.

    The cursor and connection are closed even if a query or print fails.
    """
    print("Finding Duplicate IDs in arc_puzzles")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            # A "duplicate" here is a group of rows sharing id, corpora and set.
            cursor.execute("""
                SELECT id, corpora, `set`, COUNT(*) as count
                FROM arc_puzzles
                GROUP BY id, corpora, `set`
                HAVING COUNT(*) > 1
                ORDER BY count DESC, id
            """)
            duplicates = cursor.fetchall()

            if duplicates:
                print(f"\n⚠ Found {len(duplicates)} duplicate entries:")
                print(f"{'ID':<20} {'Corpora':<15} {'Set':<20} {'Count':<10}")
                print("-" * 70)
                for puzzle_id, corpora, set_name, count in duplicates:
                    print(
                        f"{_display(puzzle_id):<20} {_display(corpora):<15} "
                        f"{_display(set_name):<20} {count:<10}"
                    )

                # Each group keeps one row; the rest are surplus.
                total_duplicates = _extra_rows(duplicates)
                print(f"\nTotal duplicate rows to remove: {total_duplicates}")

                # Check ConceptArc specifically
                conceptarc_dups = [d for d in duplicates if d[1] == 'ConceptArc']
                if conceptarc_dups:
                    print(f"\nConceptArc duplicates: {len(conceptarc_dups)} unique IDs")
                    conceptarc_dup_count = _extra_rows(conceptarc_dups)
                    print(f"ConceptArc duplicate rows to remove: {conceptarc_dup_count}")
            else:
                print("\n✓ No duplicates found!")

            # Show total counts
            print("\n" + "=" * 50)
            print("Current table statistics:")
            print("=" * 50)

            cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
            total = cursor.fetchone()[0]
            print(f"Total rows in arc_puzzles: {total}")

            cursor.execute("""
                SELECT corpora, COUNT(*) as count
                FROM arc_puzzles
                GROUP BY corpora
                ORDER BY corpora
            """)
            print("\nBy corpora:")
            for corpora, count in cursor.fetchall():
                print(f" {corpora}: {count}")
    finally:
        # Release the connection on both the success and the failure path.
        connection.close()
# Script entry point: run the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()