ConceptARC db upload
This commit is contained in:
146
scripts/remove_duplicates.py
Executable file
146
scripts/remove_duplicates.py
Executable file
@ -0,0 +1,146 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Remove duplicate entries from arc_puzzles table
|
||||
Keeps a single row for each unique (id, corpora, set) combination (which copy is kept is not specified)
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pymysql
|
||||
from dotenv import load_dotenv
|
||||
|
||||
def load_env_config():
    """Load database connection settings from the environment / .env file.

    Calls ``load_dotenv()`` first, then reads ``DB_HOST``, ``DB_USER``,
    ``DB_PASSWORD``, ``DB_NAME`` and ``DB_PORT`` via ``os.getenv``.

    Returns:
        dict: Keyword arguments for ``pymysql.connect`` with the keys
        ``host``, ``user``, ``password``, ``database``, ``port`` and
        ``charset``. Variables that are not set come back as ``None``,
        except ``port`` which falls back to 3306.

    Raises:
        ValueError: If ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        # `or` instead of a getenv default so that a variable that is set
        # but empty (e.g. "DB_PORT=" in .env) also falls back to 3306
        # rather than crashing in int('').
        'port': int(os.getenv('DB_PORT') or 3306),
        'charset': 'utf8mb4',
    }
|
||||
|
||||
def main():
    """Remove duplicate rows from the ``arc_puzzles`` table.

    Rows are considered duplicates when they share the same
    ``(id, corpora, set)`` combination. For every such group one row is
    kept and the other ``count - 1`` rows are deleted with
    ``DELETE ... LIMIT``. Because the DELETE has no ``ORDER BY``, which
    physical row survives is not specified.

    Unless ``--yes`` is present in ``sys.argv`` the user is asked to
    confirm before anything is deleted. All deletions are committed in a
    single transaction; on any exception the transaction is rolled back.

    Returns:
        int: Process exit status. 0 when there was nothing to do, the user
        cancelled, or the run completed (a warning is printed if duplicate
        groups still remain afterwards); 1 when an exception occurred.
    """
    print("Removing Duplicates from arc_puzzles Table")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)

    try:
        # The context manager closes the cursor on every exit path,
        # including the early returns below.
        with connection.cursor() as cursor:
            # Show the table layout so the operator can see which key
            # columns (if any) the table has.
            cursor.execute("DESCRIBE arc_puzzles")
            columns = cursor.fetchall()
            print("\nTable structure:")
            for col in columns:
                print(f" {col[0]:<20} {col[1]:<20} Key:{col[3]}")

            # Find duplicates
            cursor.execute("""
                SELECT id, corpora, `set`, COUNT(*) as count
                FROM arc_puzzles
                GROUP BY id, corpora, `set`
                HAVING COUNT(*) > 1
                ORDER BY id
            """)
            duplicates = cursor.fetchall()

            if not duplicates:
                print("\n✓ No duplicates found!")
                return 0

            print(f"\nFound {len(duplicates)} sets of duplicates")
            total_to_remove = sum(count - 1 for _, _, _, count in duplicates)
            print(f"Total rows to remove: {total_to_remove}")

            # Ask for confirmation
            if '--yes' not in sys.argv:
                print(f"\n⚠ This will delete {total_to_remove} duplicate rows")
                response = input("Continue? (yes/no): ").strip().lower()
                if response not in ['yes', 'y']:
                    print("Operation cancelled")
                    return 0
            else:
                print(f"\n⚠ Deleting {total_to_remove} duplicate rows (auto-confirmed with --yes flag)")

            # The duplicate rows cannot be told apart by the grouping
            # columns, so for each group we delete (count - 1) of the
            # matching rows with LIMIT and leave exactly one behind.
            print("\nRemoving duplicates...")
            removed_count = 0

            for puzzle_id, corpora, set_name, count in duplicates:
                # HAVING COUNT(*) > 1 guarantees count >= 2 here.
                delete_count = count - 1

                # `<=>` is MySQL's NULL-safe equality. GROUP BY puts NULLs
                # in one group, but a plain `=` never matches NULL, so
                # groups with a NULL key column would otherwise be
                # reported as duplicates and never actually removed.
                cursor.execute("""
                    DELETE FROM arc_puzzles
                    WHERE id <=> %s AND corpora <=> %s AND `set` <=> %s
                    LIMIT %s
                """, (puzzle_id, corpora, set_name, delete_count))

                previous_count = removed_count
                removed_count += cursor.rowcount

                # Report whenever a multiple of 50 is crossed. Testing
                # `removed_count % 50 == 0` would miss most milestones,
                # because each DELETE can remove more than one row.
                if removed_count // 50 > previous_count // 50:
                    print(f" Removed {removed_count}/{total_to_remove} duplicates...")

            # Commit the changes
            connection.commit()

            print(f"\n{'=' * 50}")
            print(f"✓ Duplicate removal complete!")
            print(f" Total duplicates removed: {removed_count}")

            # Verify no duplicates remain
            cursor.execute("""
                SELECT COUNT(*)
                FROM (
                    SELECT id, corpora, `set`, COUNT(*) as count
                    FROM arc_puzzles
                    GROUP BY id, corpora, `set`
                    HAVING COUNT(*) > 1
                ) as dups
            """)
            remaining_dups = cursor.fetchone()[0]
            if remaining_dups > 0:
                print(f"\n⚠ Warning: {remaining_dups} duplicate sets still remain")
            else:
                print(f"\n✓ No duplicates remaining!")

            # Show final counts
            cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
            final_count = cursor.fetchone()[0]
            print(f"\nFinal table size: {final_count} rows")

            cursor.execute("""
                SELECT corpora, COUNT(*) as count
                FROM arc_puzzles
                GROUP BY corpora
                ORDER BY corpora
            """)
            print("\nBy corpora:")
            for corpora, count in cursor.fetchall():
                print(f" {corpora}: {count}")

    except Exception as e:
        # Top-level boundary of the script: undo any uncommitted deletes,
        # report the failure and signal it through the exit status.
        connection.rollback()
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        connection.close()
        print(f"\n✓ Database connection closed")

    return 0
|
||||
|
||||
# Script entry point: run main() and hand its return value to the
# interpreter as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user