83 lines
2.4 KiB
Python
83 lines
2.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Find duplicate rows (same id, corpora and set) in the arc_puzzles table
and print row-count statistics for the table.
|
|
"""
|
|
|
|
import os
|
|
import pymysql
|
|
from dotenv import load_dotenv
|
|
|
|
def load_env_config():
    """Load database configuration from .env file.

    Calls ``load_dotenv()`` and then reads the connection settings from the
    process environment.

    Returns:
        dict: Keyword arguments that ``main()`` passes to ``pymysql.connect``:
        ``host``, ``user``, ``password``, ``database`` (each ``None`` when
        the corresponding ``DB_*`` variable is unset), ``port`` (int) and
        ``charset`` (always ``'utf8mb4'``).

    Raises:
        ValueError: If ``DB_PORT`` is set to a non-empty, non-integer value.
    """
    load_dotenv()
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        # Fall back to 3306 when DB_PORT is unset *or* empty: a bare
        # ``DB_PORT=`` line yields '' and int('') would raise ValueError.
        'port': int(os.getenv('DB_PORT') or 3306),
        'charset': 'utf8mb4',
    }
|
|
|
|
def main():
    """Report duplicate rows in ``arc_puzzles`` and print table statistics.

    Connects to MySQL with the settings returned by ``load_env_config()``,
    prints every (id, corpora, set) combination that occurs more than once,
    then prints the total row count and a per-corpora breakdown.

    The cursor and the connection are released even when a query fails
    part-way through.
    """
    print("Finding Duplicate IDs in arc_puzzles")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    try:
        with connection.cursor() as cursor:
            _report_duplicates(cursor)
            _report_table_stats(cursor)
    finally:
        # Always release the connection, including on query errors.
        connection.close()


def _report_duplicates(cursor):
    """Print each duplicated (id, corpora, set) group and the surplus rows."""
    cursor.execute("""
        SELECT id, corpora, `set`, COUNT(*) as count
        FROM arc_puzzles
        GROUP BY id, corpora, `set`
        HAVING COUNT(*) > 1
        ORDER BY count DESC, id
    """)
    duplicates = cursor.fetchall()

    if not duplicates:
        print("\n✓ No duplicates found!")
        return

    print(f"\n⚠ Found {len(duplicates)} duplicate entries:")
    print(f"{'ID':<20} {'Corpora':<15} {'Set':<20} {'Count':<10}")
    print("-" * 70)
    for puzzle_id, corpora, set_name, count in duplicates:
        # ``!s`` converts to str first: a NULL column arrives as None, and
        # None rejects an alignment format spec with TypeError.
        print(f"{puzzle_id!s:<20} {corpora!s:<15} {set_name!s:<20} {count:<10}")

    # Each group keeps one row; everything beyond that is surplus.
    total_duplicates = sum(count - 1 for _, _, _, count in duplicates)
    print(f"\nTotal duplicate rows to remove: {total_duplicates}")

    # Check ConceptArc specifically
    conceptarc_dups = [d for d in duplicates if d[1] == 'ConceptArc']
    if conceptarc_dups:
        print(f"\nConceptArc duplicates: {len(conceptarc_dups)} unique IDs")
        conceptarc_dup_count = sum(
            count - 1 for _, _, _, count in conceptarc_dups
        )
        print(f"ConceptArc duplicate rows to remove: {conceptarc_dup_count}")


def _report_table_stats(cursor):
    """Print the total row count of ``arc_puzzles`` and the count per corpora."""
    print("\n" + "=" * 50)
    print("Current table statistics:")
    print("=" * 50)

    cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
    total = cursor.fetchone()[0]
    print(f"Total rows in arc_puzzles: {total}")

    cursor.execute("""
        SELECT corpora, COUNT(*) as count
        FROM arc_puzzles
        GROUP BY corpora
        ORDER BY corpora
    """)
    print("\nBy corpora:")
    for corpora, count in cursor.fetchall():
        print(f" {corpora}: {count}")
|
|
|
|
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|