147 lines
4.6 KiB
Python
Executable File
147 lines
4.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Remove duplicate entries from arc_puzzles table
|
|
Keeps only the first occurrence of each unique (id, corpora, set) combination
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import pymysql
|
|
from dotenv import load_dotenv
|
|
|
|
def load_env_config():
|
|
"""Load database configuration from .env file"""
|
|
load_dotenv()
|
|
return {
|
|
'host': os.getenv('DB_HOST'),
|
|
'user': os.getenv('DB_USER'),
|
|
'password': os.getenv('DB_PASSWORD'),
|
|
'database': os.getenv('DB_NAME'),
|
|
'port': int(os.getenv('DB_PORT', 3306)),
|
|
'charset': 'utf8mb4'
|
|
}
|
|
|
|
def main():
|
|
print("Removing Duplicates from arc_puzzles Table")
|
|
print("=" * 50)
|
|
|
|
config = load_env_config()
|
|
connection = pymysql.connect(**config)
|
|
cursor = connection.cursor()
|
|
|
|
try:
|
|
# First, check if arc_puzzles has a primary key or unique identifier
|
|
cursor.execute("DESCRIBE arc_puzzles")
|
|
columns = cursor.fetchall()
|
|
print("\nTable structure:")
|
|
for col in columns:
|
|
print(f" {col[0]:<20} {col[1]:<20} Key:{col[3]}")
|
|
|
|
# Find duplicates
|
|
cursor.execute("""
|
|
SELECT id, corpora, `set`, COUNT(*) as count
|
|
FROM arc_puzzles
|
|
GROUP BY id, corpora, `set`
|
|
HAVING COUNT(*) > 1
|
|
ORDER BY id
|
|
""")
|
|
|
|
duplicates = cursor.fetchall()
|
|
|
|
if not duplicates:
|
|
print("\n✓ No duplicates found!")
|
|
return 0
|
|
|
|
print(f"\nFound {len(duplicates)} sets of duplicates")
|
|
total_to_remove = sum(count - 1 for _, _, _, count in duplicates)
|
|
print(f"Total rows to remove: {total_to_remove}")
|
|
|
|
# Ask for confirmation
|
|
if '--yes' not in sys.argv:
|
|
print(f"\n⚠ This will delete {total_to_remove} duplicate rows")
|
|
response = input("Continue? (yes/no): ").strip().lower()
|
|
if response not in ['yes', 'y']:
|
|
print("Operation cancelled")
|
|
return 0
|
|
else:
|
|
print(f"\n⚠ Deleting {total_to_remove} duplicate rows (auto-confirmed with --yes flag)")
|
|
|
|
# For each duplicate set, keep only one and delete the rest
|
|
# Since there's no auto-increment primary key, we'll use a different approach
|
|
# We'll create a temporary table with unique entries, then replace the original
|
|
|
|
print("\nRemoving duplicates...")
|
|
removed_count = 0
|
|
|
|
for puzzle_id, corpora, set_name, count in duplicates:
|
|
if count > 1:
|
|
# Keep one, delete the extras
|
|
# We delete (count - 1) duplicates
|
|
delete_count = count - 1
|
|
|
|
# Delete using LIMIT to remove only the extra copies
|
|
cursor.execute("""
|
|
DELETE FROM arc_puzzles
|
|
WHERE id = %s AND corpora = %s AND `set` = %s
|
|
LIMIT %s
|
|
""", (puzzle_id, corpora, set_name, delete_count))
|
|
|
|
removed_count += cursor.rowcount
|
|
|
|
if removed_count % 50 == 0:
|
|
print(f" Removed {removed_count}/{total_to_remove} duplicates...")
|
|
|
|
# Commit the changes
|
|
connection.commit()
|
|
|
|
print(f"\n{'=' * 50}")
|
|
print(f"✓ Duplicate removal complete!")
|
|
print(f" Total duplicates removed: {removed_count}")
|
|
|
|
# Verify no duplicates remain
|
|
cursor.execute("""
|
|
SELECT COUNT(*)
|
|
FROM (
|
|
SELECT id, corpora, `set`, COUNT(*) as count
|
|
FROM arc_puzzles
|
|
GROUP BY id, corpora, `set`
|
|
HAVING COUNT(*) > 1
|
|
) as dups
|
|
""")
|
|
|
|
remaining_dups = cursor.fetchone()[0]
|
|
if remaining_dups > 0:
|
|
print(f"\n⚠ Warning: {remaining_dups} duplicate sets still remain")
|
|
else:
|
|
print(f"\n✓ No duplicates remaining!")
|
|
|
|
# Show final counts
|
|
cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
|
|
final_count = cursor.fetchone()[0]
|
|
print(f"\nFinal table size: {final_count} rows")
|
|
|
|
cursor.execute("""
|
|
SELECT corpora, COUNT(*) as count
|
|
FROM arc_puzzles
|
|
GROUP BY corpora
|
|
ORDER BY corpora
|
|
""")
|
|
print("\nBy corpora:")
|
|
for corpora, count in cursor.fetchall():
|
|
print(f" {corpora}: {count}")
|
|
|
|
except Exception as e:
|
|
connection.rollback()
|
|
print(f"\n✗ Error: {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
return 1
|
|
finally:
|
|
connection.close()
|
|
print(f"\n✓ Database connection closed")
|
|
|
|
return 0
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(main())
|