ConceptARC db upload

This commit is contained in:
bmachado
2025-11-05 13:28:23 +00:00
parent aec95ed954
commit fa5f12fd47
10 changed files with 1061 additions and 1 deletions

146
scripts/remove_duplicates.py Executable file
View File

@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""
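Remove duplicate entries from the arc_puzzles table.

Keeps exactly one row for each unique (id, corpora, set) combination. The
extra copies are deleted with DELETE ... LIMIT and no ORDER BY, so which copy
survives is not guaranteed. Pass --yes to skip the confirmation prompt.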
"""
import os
import sys
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Load database connection settings from the environment.

    Calls ``load_dotenv()`` first so that values defined in a ``.env`` file
    are visible through ``os.getenv``.

    Returns:
        dict: Connection settings with the keys ``host``, ``user``,
        ``password`` and ``database`` (each ``None`` when the corresponding
        ``DB_*`` variable is unset), ``port`` (an ``int``, 3306 when
        ``DB_PORT`` is unset or empty) and ``charset`` (always ``'utf8mb4'``).

    Raises:
        ValueError: If ``DB_PORT`` is set to a non-empty, non-integer value.
    """
    load_dotenv()
    # os.getenv's default only applies when the variable is missing. An empty
    # value (e.g. a bare ``DB_PORT=`` line) would reach int('') and raise, so
    # treat "unset" and "empty" alike and fall back to 3306.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def _find_duplicate_groups(cursor):
    """Return one (id, corpora, set, count) row per key occurring more than once."""
    cursor.execute("""
        SELECT id, corpora, `set`, COUNT(*) as count
        FROM arc_puzzles
        GROUP BY id, corpora, `set`
        HAVING COUNT(*) > 1
        ORDER BY id
    """)
    return cursor.fetchall()


def _delete_extra_copies(cursor, duplicates, total_to_remove):
    """Delete all but one row of every duplicate group; return rows deleted."""
    report_every = 50
    next_report = report_every
    removed_count = 0
    for puzzle_id, corpora, set_name, count in duplicates:
        # `<=>` is MySQL's NULL-safe equality. GROUP BY collects NULL key
        # values into a single group, but `col = NULL` never matches, so with
        # plain `=` a group whose key contains NULL would be reported as a
        # duplicate and then never cleaned up.
        # LIMIT caps the delete at (count - 1) rows so exactly one copy stays.
        cursor.execute("""
            DELETE FROM arc_puzzles
            WHERE id <=> %s AND corpora <=> %s AND `set` <=> %s
            LIMIT %s
        """, (puzzle_id, corpora, set_name, count - 1))
        removed_count += cursor.rowcount

        # One DELETE can remove several rows, so the running total may step
        # over a multiple of 50; report on crossing it rather than on an
        # exact hit.
        if removed_count >= next_report:
            print(f" Removed {removed_count}/{total_to_remove} duplicates...")
            next_report = (removed_count // report_every + 1) * report_every
    return removed_count


def _print_final_counts(cursor):
    """Print the total row count and the per-corpora row counts."""
    cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
    final_count = cursor.fetchone()[0]
    print(f"\nFinal table size: {final_count} rows")

    cursor.execute("""
        SELECT corpora, COUNT(*) as count
        FROM arc_puzzles
        GROUP BY corpora
        ORDER BY corpora
    """)
    print("\nBy corpora:")
    for corpora, count in cursor.fetchall():
        print(f" {corpora}: {count}")


def main():
    """Remove duplicate (id, corpora, set) rows from the arc_puzzles table.

    Prints the table structure, finds every key that occurs more than once,
    asks for confirmation (skipped when ``--yes`` is in ``sys.argv``), deletes
    all but one row per duplicated key, commits, and then prints a
    verification check and the final row counts.

    Which copy of a duplicated row survives is not determined: the DELETE
    uses LIMIT without ORDER BY.

    Returns:
        int: 0 on success, when there is nothing to do, or when the user
        declines; 1 if any exception was raised after connecting (the
        transaction is rolled back and the traceback printed).
    """
    print("Removing Duplicates from arc_puzzles Table")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    try:
        # The context manager closes the cursor on every exit path.
        with connection.cursor() as cursor:
            # Show the table layout, including which columns carry a key.
            cursor.execute("DESCRIBE arc_puzzles")
            print("\nTable structure:")
            for col in cursor.fetchall():
                print(f" {col[0]:<20} {col[1]:<20} Key:{col[3]}")

            duplicates = _find_duplicate_groups(cursor)
            if not duplicates:
                print("\n✓ No duplicates found!")
                return 0

            print(f"\nFound {len(duplicates)} sets of duplicates")
            total_to_remove = sum(count - 1 for _, _, _, count in duplicates)
            print(f"Total rows to remove: {total_to_remove}")

            # Ask for confirmation unless --yes was given on the command line.
            if '--yes' not in sys.argv:
                print(f"\n⚠ This will delete {total_to_remove} duplicate rows")
                response = input("Continue? (yes/no): ").strip().lower()
                if response not in ('yes', 'y'):
                    print("Operation cancelled")
                    return 0
            else:
                print(f"\n⚠ Deleting {total_to_remove} duplicate rows (auto-confirmed with --yes flag)")

            print("\nRemoving duplicates...")
            removed_count = _delete_extra_copies(
                cursor, duplicates, total_to_remove
            )

            # Nothing is permanent until this point; an error above is
            # rolled back by the except branch.
            connection.commit()
            print(f"\n{'=' * 50}")
            print("✓ Duplicate removal complete!")
            print(f" Total duplicates removed: {removed_count}")

            # Verify no duplicates remain.
            remaining_dups = len(_find_duplicate_groups(cursor))
            if remaining_dups > 0:
                print(f"\n⚠ Warning: {remaining_dups} duplicate sets still remain")
            else:
                print("\n✓ No duplicates remaining!")

            _print_final_counts(cursor)
    except Exception as e:
        # Top-level boundary of the script: undo uncommitted deletes, report
        # the failure, and signal it through the exit code.
        connection.rollback()
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        connection.close()
        print("\n✓ Database connection closed")
    return 0
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())