#!/usr/bin/env python3 """ Upload ConceptArc puzzle data to MariaDB database Reads credentials from .env file and inserts all ConceptArc JSON files """ import json import os import sys from pathlib import Path import pymysql from dotenv import load_dotenv # Force unbuffered output sys.stdout.reconfigure(line_buffering=True) sys.stderr.reconfigure(line_buffering=True) def load_env_config(): """Load database configuration from .env file""" load_dotenv() config = { 'host': os.getenv('DB_HOST'), 'user': os.getenv('DB_USER'), 'password': os.getenv('DB_PASSWORD'), 'database': os.getenv('DB_NAME'), 'port': int(os.getenv('DB_PORT', 3306)), 'charset': 'utf8mb4' } return config def get_conceptarc_files(): """Get all JSON files from ConceptArc subdirectories""" base_dir = Path('data/ConceptArc') if not base_dir.exists(): print(f"✗ ConceptArc directory not found: {base_dir}") return [] # Get all subdirectories (concept categories) categories = [d for d in base_dir.iterdir() if d.is_dir()] all_files = [] for category in sorted(categories): json_files = sorted(category.glob('*.json')) all_files.extend(json_files) return all_files def insert_puzzle(cursor, file_path): """Insert a single ConceptArc puzzle into the database""" # Extract puzzle ID from filename (e.g., "Count1.json" -> "Count1") puzzle_id = file_path.stem # Extract category from parent directory (e.g., "Count", "Center", etc.) category = file_path.parent.name # Read JSON content with open(file_path, 'r') as f: json_content = f.read().strip() # Verify JSON is valid and extract solution try: puzzle_data = json.loads(json_content) except json.JSONDecodeError as e: raise ValueError(f"Invalid JSON in {file_path}: {e}") # Extract all solutions from test cases # ConceptArc has multiple test cases, each with their own output solution = None if 'test' in puzzle_data and len(puzzle_data['test']) > 0: # Collect all test outputs into an array all_outputs = [] for test_case in puzzle_data['test']: if 'output' in test_case: all_outputs.append(test_case['output']) # Store as JSON array if there are any outputs if all_outputs: solution = json.dumps(all_outputs) # Insert into arc_puzzles table # id, corpora, set, difficulty, version # Note: `set` is a reserved keyword, so we escape it with backticks sql_puzzles = """ INSERT INTO arc_puzzles (id, corpora, `set`, difficulty, version) VALUES (%s, %s, %s, %s, %s) """ cursor.execute(sql_puzzles, (puzzle_id, 'ConceptArc', category, None, None)) # Insert into arc_jsons table # arc_puzzle_id, json, solution sql_jsons = """ INSERT INTO arc_jsons (arc_puzzle_id, json, solution) VALUES (%s, %s, %s) """ cursor.execute(sql_jsons, (puzzle_id, json_content, solution)) return puzzle_id, category def main(): print("ConceptArc Data Upload to MariaDB") print("=" * 50) # Load configuration try: config = load_env_config() print(f"✓ Loaded configuration from .env") print(f" Host: {config['host']}") print(f" Database: {config['database']}") print(f" User: {config['user']}") except Exception as e: print(f"✗ Error loading configuration: {e}") return 1 # Get all ConceptArc files all_files = get_conceptarc_files() if not all_files: print(f"✗ No ConceptArc JSON files found") return 1 print(f"✓ Found {len(all_files)} ConceptArc JSON files") # Count files per category from collections import defaultdict category_counts = defaultdict(int) for file_path in all_files: category_counts[file_path.parent.name] += 1 print(f"\nBreakdown by category:") for category, count in sorted(category_counts.items()): print(f" - {category}: {count} files") # Connect to database try: print(f"\nConnecting to database...") connection = pymysql.connect(**config) print(f"✓ Connected successfully") except Exception as e: print(f"✗ Database connection failed: {e}") return 1 try: cursor = connection.cursor() # Check if tables exist cursor.execute("SHOW TABLES LIKE 'arc_puzzles'") if not cursor.fetchone(): print(f"✗ Table 'arc_puzzles' does not exist") return 1 cursor.execute("SHOW TABLES LIKE 'arc_jsons'") if not cursor.fetchone(): print(f"✗ Table 'arc_jsons' does not exist") return 1 # Get current counts cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'") initial_puzzles_count = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE arc_puzzle_id LIKE '%Count%' OR arc_puzzle_id LIKE '%Center%'") initial_jsons_count = cursor.fetchone()[0] print(f"✓ Tables exist") print(f" Current ConceptArc puzzles in arc_puzzles: {initial_puzzles_count}") print(f" Current ConceptArc-like entries in arc_jsons: {initial_jsons_count}") # Ask for confirmation (unless --yes flag is provided) if '--yes' not in sys.argv: print(f"\n⚠ About to insert {len(all_files)} ConceptArc records") response = input("Continue? (yes/no): ").strip().lower() if response not in ['yes', 'y']: print("Upload cancelled") return 0 else: print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)") print(f"\nInserting records...") inserted = 0 errors = 0 category_inserted = defaultdict(int) for i, file_path in enumerate(all_files, 1): try: puzzle_id, category = insert_puzzle(cursor, file_path) inserted += 1 category_inserted[category] += 1 # Show progress every 20 records if i % 20 == 0 or i == len(all_files): print(f" Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)") except pymysql.IntegrityError as e: # Likely duplicate key if "Duplicate entry" in str(e): errors += 1 if errors <= 5: # Only show first 5 errors print(f" ⚠ Duplicate: {file_path.stem} ({file_path.parent.name})") else: raise except Exception as e: errors += 1 print(f" ✗ Error with {file_path.stem}: {e}") if errors > 10: print(f" Too many errors, stopping...") break # Commit the transaction connection.commit() # Get final counts cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'") final_puzzles_count = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM arc_jsons") final_jsons_count = cursor.fetchone()[0] print(f"\n{'=' * 50}") print(f"✓ Upload complete!") print(f" Successfully inserted: {inserted}") print(f" Errors/duplicates: {errors}") print(f" ConceptArc puzzles: {initial_puzzles_count} → {final_puzzles_count} (+{final_puzzles_count - initial_puzzles_count})") print(f"\nInserted by category:") for category, count in sorted(category_inserted.items()): print(f" - {category}: {count} puzzles") except Exception as e: connection.rollback() print(f"\n✗ Error during upload: {e}") import traceback traceback.print_exc() return 1 finally: connection.close() print(f"\n✓ Database connection closed") return 0 if __name__ == '__main__': sys.exit(main())