ConceptARC db upload
This commit is contained in:
82
scripts/find_duplicates.py
Normal file
82
scripts/find_duplicates.py
Normal file
@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
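Find rows in the arc_puzzles table that share the same (id, corpora, set)
and print a summary of the duplicates along with per-corpora row counts.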
|
||||
"""
|
||||
|
||||
import os
|
||||
import pymysql
|
||||
from dotenv import load_dotenv
|
||||
|
||||
def load_env_config():
    """Load database configuration from .env file.

    Calls ``load_dotenv()`` and then reads the connection settings from the
    process environment.

    Returns:
        dict: Keyword arguments for ``pymysql.connect`` with the keys
        ``host``, ``user``, ``password``, ``database``, ``port`` and
        ``charset``. ``host``, ``user``, ``password`` and ``database`` are
        ``None`` when the corresponding ``DB_*`` variable is not set.

    Raises:
        ValueError: If ``DB_PORT`` is set to a non-empty value that is not
            an integer.
    """
    load_dotenv()

    # ``os.getenv`` only applies its default when the variable is *unset*;
    # a blank ``DB_PORT=`` line yields '' and ``int('')`` would raise.
    # Treat unset and empty the same way and fall back to the MySQL default.
    port = int(os.getenv('DB_PORT') or 3306)

    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
|
||||
|
||||
def _cell(value):
    """Return *value* as text so it can be padded with a ``<N`` format spec."""
    # A NULL column arrives as None, and None rejects width/alignment specs
    # (TypeError), so convert everything to str before formatting.
    return 'NULL' if value is None else str(value)


def _print_duplicate_report(duplicates):
    """Print the duplicate groups and how many rows would need removing.

    Args:
        duplicates: Sequence of ``(id, corpora, set, count)`` rows, one per
            group that occurs more than once.
    """
    if not duplicates:
        print("\n✓ No duplicates found!")
        return

    print(f"\n⚠ Found {len(duplicates)} duplicate entries:")
    print(f"{'ID':<20} {'Corpora':<15} {'Set':<20} {'Count':<10}")
    print("-" * 70)
    for puzzle_id, corpora, set_name, count in duplicates:
        print(
            f"{_cell(puzzle_id):<20} {_cell(corpora):<15} "
            f"{_cell(set_name):<20} {count:<10}"
        )

    # Each group keeps one row; the remaining (count - 1) are surplus.
    total_duplicates = sum(count - 1 for _, _, _, count in duplicates)
    print(f"\nTotal duplicate rows to remove: {total_duplicates}")

    # Check ConceptArc specifically.
    # NOTE(review): this is a case-sensitive Python comparison; confirm the
    # stored corpora value is spelled exactly 'ConceptArc'.
    conceptarc_dups = [row for row in duplicates if row[1] == 'ConceptArc']
    if conceptarc_dups:
        print(f"\nConceptArc duplicates: {len(conceptarc_dups)} unique IDs")
        conceptarc_dup_count = sum(
            count - 1 for _, _, _, count in conceptarc_dups
        )
        print(f"ConceptArc duplicate rows to remove: {conceptarc_dup_count}")


def _print_table_statistics(cursor):
    """Print the total row count of arc_puzzles and the count per corpora.

    Args:
        cursor: An open database cursor that returns rows as tuples.
    """
    print("\n" + "=" * 50)
    print("Current table statistics:")
    print("=" * 50)

    cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
    total = cursor.fetchone()[0]
    print(f"Total rows in arc_puzzles: {total}")

    cursor.execute("""
        SELECT corpora, COUNT(*) as count
        FROM arc_puzzles
        GROUP BY corpora
        ORDER BY corpora
    """)
    print("\nBy corpora:")
    for corpora, count in cursor.fetchall():
        print(f" {corpora}: {count}")


def main():
    """Report duplicate (id, corpora, set) rows in arc_puzzles.

    Connects with the settings from ``load_env_config()``, prints every
    group that occurs more than once, then prints overall table statistics.
    The script only reads from the database.
    """
    print("Finding Duplicate IDs in arc_puzzles")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    # Close the connection even if a query or the report printing fails.
    try:
        with connection.cursor() as cursor:
            # Find all duplicate IDs
            cursor.execute("""
                SELECT id, corpora, `set`, COUNT(*) as count
                FROM arc_puzzles
                GROUP BY id, corpora, `set`
                HAVING COUNT(*) > 1
                ORDER BY count DESC, id
            """)
            duplicates = cursor.fetchall()

            _print_duplicate_report(duplicates)
            _print_table_statistics(cursor)
    finally:
        connection.close()
|
||||
|
||||
if __name__ == "__main__":
    # Run the report only when executed as a script, not when imported.
    main()
|
||||
Reference in New Issue
Block a user