ConceptARC db upload
This commit is contained in:
240
scripts/upload_conceptarc_to_db.py
Executable file
240
scripts/upload_conceptarc_to_db.py
Executable file
@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Upload ConceptArc puzzle data to MariaDB database
|
||||
Reads credentials from .env file and inserts all ConceptArc JSON files
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import pymysql
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Force unbuffered output
|
||||
sys.stdout.reconfigure(line_buffering=True)
|
||||
sys.stderr.reconfigure(line_buffering=True)
|
||||
|
||||
def load_env_config():
|
||||
"""Load database configuration from .env file"""
|
||||
load_dotenv()
|
||||
|
||||
config = {
|
||||
'host': os.getenv('DB_HOST'),
|
||||
'user': os.getenv('DB_USER'),
|
||||
'password': os.getenv('DB_PASSWORD'),
|
||||
'database': os.getenv('DB_NAME'),
|
||||
'port': int(os.getenv('DB_PORT', 3306)),
|
||||
'charset': 'utf8mb4'
|
||||
}
|
||||
|
||||
return config
|
||||
|
||||
def get_conceptarc_files():
|
||||
"""Get all JSON files from ConceptArc subdirectories"""
|
||||
base_dir = Path('data/ConceptArc')
|
||||
|
||||
if not base_dir.exists():
|
||||
print(f"✗ ConceptArc directory not found: {base_dir}")
|
||||
return []
|
||||
|
||||
# Get all subdirectories (concept categories)
|
||||
categories = [d for d in base_dir.iterdir() if d.is_dir()]
|
||||
|
||||
all_files = []
|
||||
for category in sorted(categories):
|
||||
json_files = sorted(category.glob('*.json'))
|
||||
all_files.extend(json_files)
|
||||
|
||||
return all_files
|
||||
|
||||
def insert_puzzle(cursor, file_path):
|
||||
"""Insert a single ConceptArc puzzle into the database"""
|
||||
# Extract puzzle ID from filename (e.g., "Count1.json" -> "Count1")
|
||||
puzzle_id = file_path.stem
|
||||
|
||||
# Extract category from parent directory (e.g., "Count", "Center", etc.)
|
||||
category = file_path.parent.name
|
||||
|
||||
# Read JSON content
|
||||
with open(file_path, 'r') as f:
|
||||
json_content = f.read().strip()
|
||||
|
||||
# Verify JSON is valid and extract solution
|
||||
try:
|
||||
puzzle_data = json.loads(json_content)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Invalid JSON in {file_path}: {e}")
|
||||
|
||||
# Extract all solutions from test cases
|
||||
# ConceptArc has multiple test cases, each with their own output
|
||||
solution = None
|
||||
if 'test' in puzzle_data and len(puzzle_data['test']) > 0:
|
||||
# Collect all test outputs into an array
|
||||
all_outputs = []
|
||||
for test_case in puzzle_data['test']:
|
||||
if 'output' in test_case:
|
||||
all_outputs.append(test_case['output'])
|
||||
|
||||
# Store as JSON array if there are any outputs
|
||||
if all_outputs:
|
||||
solution = json.dumps(all_outputs)
|
||||
|
||||
# Insert into arc_puzzles table
|
||||
# id, corpora, set, difficulty, version
|
||||
# Note: `set` is a reserved keyword, so we escape it with backticks
|
||||
sql_puzzles = """
|
||||
INSERT INTO arc_puzzles (id, corpora, `set`, difficulty, version)
|
||||
VALUES (%s, %s, %s, %s, %s)
|
||||
"""
|
||||
cursor.execute(sql_puzzles, (puzzle_id, 'ConceptArc', category, None, None))
|
||||
|
||||
# Insert into arc_jsons table
|
||||
# arc_puzzle_id, json, solution
|
||||
sql_jsons = """
|
||||
INSERT INTO arc_jsons (arc_puzzle_id, json, solution)
|
||||
VALUES (%s, %s, %s)
|
||||
"""
|
||||
cursor.execute(sql_jsons, (puzzle_id, json_content, solution))
|
||||
|
||||
return puzzle_id, category
|
||||
|
||||
def main():
|
||||
print("ConceptArc Data Upload to MariaDB")
|
||||
print("=" * 50)
|
||||
|
||||
# Load configuration
|
||||
try:
|
||||
config = load_env_config()
|
||||
print(f"✓ Loaded configuration from .env")
|
||||
print(f" Host: {config['host']}")
|
||||
print(f" Database: {config['database']}")
|
||||
print(f" User: {config['user']}")
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading configuration: {e}")
|
||||
return 1
|
||||
|
||||
# Get all ConceptArc files
|
||||
all_files = get_conceptarc_files()
|
||||
if not all_files:
|
||||
print(f"✗ No ConceptArc JSON files found")
|
||||
return 1
|
||||
|
||||
print(f"✓ Found {len(all_files)} ConceptArc JSON files")
|
||||
|
||||
# Count files per category
|
||||
from collections import defaultdict
|
||||
category_counts = defaultdict(int)
|
||||
for file_path in all_files:
|
||||
category_counts[file_path.parent.name] += 1
|
||||
|
||||
print(f"\nBreakdown by category:")
|
||||
for category, count in sorted(category_counts.items()):
|
||||
print(f" - {category}: {count} files")
|
||||
|
||||
# Connect to database
|
||||
try:
|
||||
print(f"\nConnecting to database...")
|
||||
connection = pymysql.connect(**config)
|
||||
print(f"✓ Connected successfully")
|
||||
except Exception as e:
|
||||
print(f"✗ Database connection failed: {e}")
|
||||
return 1
|
||||
|
||||
try:
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check if tables exist
|
||||
cursor.execute("SHOW TABLES LIKE 'arc_puzzles'")
|
||||
if not cursor.fetchone():
|
||||
print(f"✗ Table 'arc_puzzles' does not exist")
|
||||
return 1
|
||||
|
||||
cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
|
||||
if not cursor.fetchone():
|
||||
print(f"✗ Table 'arc_jsons' does not exist")
|
||||
return 1
|
||||
|
||||
# Get current counts
|
||||
cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'")
|
||||
initial_puzzles_count = cursor.fetchone()[0]
|
||||
cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE arc_puzzle_id LIKE '%Count%' OR arc_puzzle_id LIKE '%Center%'")
|
||||
initial_jsons_count = cursor.fetchone()[0]
|
||||
|
||||
print(f"✓ Tables exist")
|
||||
print(f" Current ConceptArc puzzles in arc_puzzles: {initial_puzzles_count}")
|
||||
print(f" Current ConceptArc-like entries in arc_jsons: {initial_jsons_count}")
|
||||
|
||||
# Ask for confirmation (unless --yes flag is provided)
|
||||
if '--yes' not in sys.argv:
|
||||
print(f"\n⚠ About to insert {len(all_files)} ConceptArc records")
|
||||
response = input("Continue? (yes/no): ").strip().lower()
|
||||
if response not in ['yes', 'y']:
|
||||
print("Upload cancelled")
|
||||
return 0
|
||||
else:
|
||||
print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)")
|
||||
|
||||
print(f"\nInserting records...")
|
||||
inserted = 0
|
||||
errors = 0
|
||||
category_inserted = defaultdict(int)
|
||||
|
||||
for i, file_path in enumerate(all_files, 1):
|
||||
try:
|
||||
puzzle_id, category = insert_puzzle(cursor, file_path)
|
||||
inserted += 1
|
||||
category_inserted[category] += 1
|
||||
|
||||
# Show progress every 20 records
|
||||
if i % 20 == 0 or i == len(all_files):
|
||||
print(f" Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)")
|
||||
|
||||
except pymysql.IntegrityError as e:
|
||||
# Likely duplicate key
|
||||
if "Duplicate entry" in str(e):
|
||||
errors += 1
|
||||
if errors <= 5: # Only show first 5 errors
|
||||
print(f" ⚠ Duplicate: {file_path.stem} ({file_path.parent.name})")
|
||||
else:
|
||||
raise
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
print(f" ✗ Error with {file_path.stem}: {e}")
|
||||
if errors > 10:
|
||||
print(f" Too many errors, stopping...")
|
||||
break
|
||||
|
||||
# Commit the transaction
|
||||
connection.commit()
|
||||
|
||||
# Get final counts
|
||||
cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'")
|
||||
final_puzzles_count = cursor.fetchone()[0]
|
||||
cursor.execute("SELECT COUNT(*) FROM arc_jsons")
|
||||
final_jsons_count = cursor.fetchone()[0]
|
||||
|
||||
print(f"\n{'=' * 50}")
|
||||
print(f"✓ Upload complete!")
|
||||
print(f" Successfully inserted: {inserted}")
|
||||
print(f" Errors/duplicates: {errors}")
|
||||
print(f" ConceptArc puzzles: {initial_puzzles_count} → {final_puzzles_count} (+{final_puzzles_count - initial_puzzles_count})")
|
||||
|
||||
print(f"\nInserted by category:")
|
||||
for category, count in sorted(category_inserted.items()):
|
||||
print(f" - {category}: {count} puzzles")
|
||||
|
||||
except Exception as e:
|
||||
connection.rollback()
|
||||
print(f"\n✗ Error during upload: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return 1
|
||||
finally:
|
||||
connection.close()
|
||||
print(f"\n✓ Database connection closed")
|
||||
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user