ConceptARC db upload

This commit is contained in:
bmachado
2025-11-05 13:28:23 +00:00
parent aec95ed954
commit fa5f12fd47
10 changed files with 1061 additions and 1 deletions

View File

@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Upload ConceptArc puzzle data to MariaDB database
Reads credentials from .env file and inserts all ConceptArc JSON files
"""
import json
import os
import sys
from pathlib import Path
import pymysql
from dotenv import load_dotenv
# Force unbuffered output
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
def load_env_config():
"""Load database configuration from .env file"""
load_dotenv()
config = {
'host': os.getenv('DB_HOST'),
'user': os.getenv('DB_USER'),
'password': os.getenv('DB_PASSWORD'),
'database': os.getenv('DB_NAME'),
'port': int(os.getenv('DB_PORT', 3306)),
'charset': 'utf8mb4'
}
return config
def get_conceptarc_files():
"""Get all JSON files from ConceptArc subdirectories"""
base_dir = Path('data/ConceptArc')
if not base_dir.exists():
print(f"✗ ConceptArc directory not found: {base_dir}")
return []
# Get all subdirectories (concept categories)
categories = [d for d in base_dir.iterdir() if d.is_dir()]
all_files = []
for category in sorted(categories):
json_files = sorted(category.glob('*.json'))
all_files.extend(json_files)
return all_files
def insert_puzzle(cursor, file_path):
"""Insert a single ConceptArc puzzle into the database"""
# Extract puzzle ID from filename (e.g., "Count1.json" -> "Count1")
puzzle_id = file_path.stem
# Extract category from parent directory (e.g., "Count", "Center", etc.)
category = file_path.parent.name
# Read JSON content
with open(file_path, 'r') as f:
json_content = f.read().strip()
# Verify JSON is valid and extract solution
try:
puzzle_data = json.loads(json_content)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON in {file_path}: {e}")
# Extract all solutions from test cases
# ConceptArc has multiple test cases, each with their own output
solution = None
if 'test' in puzzle_data and len(puzzle_data['test']) > 0:
# Collect all test outputs into an array
all_outputs = []
for test_case in puzzle_data['test']:
if 'output' in test_case:
all_outputs.append(test_case['output'])
# Store as JSON array if there are any outputs
if all_outputs:
solution = json.dumps(all_outputs)
# Insert into arc_puzzles table
# id, corpora, set, difficulty, version
# Note: `set` is a reserved keyword, so we escape it with backticks
sql_puzzles = """
INSERT INTO arc_puzzles (id, corpora, `set`, difficulty, version)
VALUES (%s, %s, %s, %s, %s)
"""
cursor.execute(sql_puzzles, (puzzle_id, 'ConceptArc', category, None, None))
# Insert into arc_jsons table
# arc_puzzle_id, json, solution
sql_jsons = """
INSERT INTO arc_jsons (arc_puzzle_id, json, solution)
VALUES (%s, %s, %s)
"""
cursor.execute(sql_jsons, (puzzle_id, json_content, solution))
return puzzle_id, category
def main():
print("ConceptArc Data Upload to MariaDB")
print("=" * 50)
# Load configuration
try:
config = load_env_config()
print(f"✓ Loaded configuration from .env")
print(f" Host: {config['host']}")
print(f" Database: {config['database']}")
print(f" User: {config['user']}")
except Exception as e:
print(f"✗ Error loading configuration: {e}")
return 1
# Get all ConceptArc files
all_files = get_conceptarc_files()
if not all_files:
print(f"✗ No ConceptArc JSON files found")
return 1
print(f"✓ Found {len(all_files)} ConceptArc JSON files")
# Count files per category
from collections import defaultdict
category_counts = defaultdict(int)
for file_path in all_files:
category_counts[file_path.parent.name] += 1
print(f"\nBreakdown by category:")
for category, count in sorted(category_counts.items()):
print(f" - {category}: {count} files")
# Connect to database
try:
print(f"\nConnecting to database...")
connection = pymysql.connect(**config)
print(f"✓ Connected successfully")
except Exception as e:
print(f"✗ Database connection failed: {e}")
return 1
try:
cursor = connection.cursor()
# Check if tables exist
cursor.execute("SHOW TABLES LIKE 'arc_puzzles'")
if not cursor.fetchone():
print(f"✗ Table 'arc_puzzles' does not exist")
return 1
cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
if not cursor.fetchone():
print(f"✗ Table 'arc_jsons' does not exist")
return 1
# Get current counts
cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'")
initial_puzzles_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE arc_puzzle_id LIKE '%Count%' OR arc_puzzle_id LIKE '%Center%'")
initial_jsons_count = cursor.fetchone()[0]
print(f"✓ Tables exist")
print(f" Current ConceptArc puzzles in arc_puzzles: {initial_puzzles_count}")
print(f" Current ConceptArc-like entries in arc_jsons: {initial_jsons_count}")
# Ask for confirmation (unless --yes flag is provided)
if '--yes' not in sys.argv:
print(f"\n⚠ About to insert {len(all_files)} ConceptArc records")
response = input("Continue? (yes/no): ").strip().lower()
if response not in ['yes', 'y']:
print("Upload cancelled")
return 0
else:
print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)")
print(f"\nInserting records...")
inserted = 0
errors = 0
category_inserted = defaultdict(int)
for i, file_path in enumerate(all_files, 1):
try:
puzzle_id, category = insert_puzzle(cursor, file_path)
inserted += 1
category_inserted[category] += 1
# Show progress every 20 records
if i % 20 == 0 or i == len(all_files):
print(f" Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)")
except pymysql.IntegrityError as e:
# Likely duplicate key
if "Duplicate entry" in str(e):
errors += 1
if errors <= 5: # Only show first 5 errors
print(f" ⚠ Duplicate: {file_path.stem} ({file_path.parent.name})")
else:
raise
except Exception as e:
errors += 1
print(f" ✗ Error with {file_path.stem}: {e}")
if errors > 10:
print(f" Too many errors, stopping...")
break
# Commit the transaction
connection.commit()
# Get final counts
cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'")
final_puzzles_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM arc_jsons")
final_jsons_count = cursor.fetchone()[0]
print(f"\n{'=' * 50}")
print(f"✓ Upload complete!")
print(f" Successfully inserted: {inserted}")
print(f" Errors/duplicates: {errors}")
print(f" ConceptArc puzzles: {initial_puzzles_count}{final_puzzles_count} (+{final_puzzles_count - initial_puzzles_count})")
print(f"\nInserted by category:")
for category, count in sorted(category_inserted.items()):
print(f" - {category}: {count} puzzles")
except Exception as e:
connection.rollback()
print(f"\n✗ Error during upload: {e}")
import traceback
traceback.print_exc()
return 1
finally:
connection.close()
print(f"\n✓ Database connection closed")
return 0
if __name__ == '__main__':
sys.exit(main())