#!/usr/bin/env python3 """ Upload MiniARC puzzle data to MariaDB database Reads credentials from .env file and inserts all MiniARC JSON files File naming convention: {comment}_{id}.json - id: last part after underscore (e.g., l6aejqqqc1b47pjr5g4) - comment: everything before the id (e.g., "1_3_5") """ import json import os import sys import re from pathlib import Path import pymysql from dotenv import load_dotenv # Force unbuffered output sys.stdout.reconfigure(line_buffering=True) sys.stderr.reconfigure(line_buffering=True) def load_env_config(): """Load database configuration from .env file""" load_dotenv() config = { 'host': os.getenv('DB_HOST'), 'user': os.getenv('DB_USER'), 'password': os.getenv('DB_PASSWORD'), 'database': os.getenv('DB_NAME'), 'port': int(os.getenv('DB_PORT', 3306)), 'charset': 'utf8mb4' } return config def parse_filename(filename): """ Parse MiniARC filename to extract id and comment. Filename format: {comment}_{id}.json The id is the last part starting with 'l6' or similar pattern. Examples: - "1_3_5_l6aejqqqc1b47pjr5g4.json" -> id="l6aejqqqc1b47pjr5g4", comment="1_3_5" - "Good_axis_l69ctqaoulgvm1zso2.json" -> id="l69ctqaoulgvm1zso2", comment="Good_axis" """ stem = filename.replace('.json', '') # The ID pattern appears to be: l6[a-z0-9]+ or l69[a-z0-9]+ # Find the last underscore followed by 'l6' pattern match = re.search(r'^(.+)_(l6[a-z0-9]+)$', stem) if match: comment = match.group(1) puzzle_id = match.group(2) return puzzle_id, comment else: # Fallback: use the whole stem as id return stem, None def get_miniarc_files(): """Get all JSON files from MiniARC directory""" base_dir = Path('data/MiniARC') if not base_dir.exists(): print(f"✗ MiniARC directory not found: {base_dir}") return [] json_files = sorted(base_dir.glob('*.json')) return json_files def insert_puzzle(cursor, file_path): """Insert a single MiniARC puzzle into the database""" # Parse filename to extract puzzle ID and comment puzzle_id, comment = parse_filename(file_path.name) # Read JSON content with open(file_path, 'r') as f: json_content = f.read().strip() # Verify JSON is valid and extract solution try: puzzle_data = json.loads(json_content) except json.JSONDecodeError as e: raise ValueError(f"Invalid JSON in {file_path}: {e}") # Extract all solutions from test cases solution = None if 'test' in puzzle_data and len(puzzle_data['test']) > 0: all_outputs = [] for test_case in puzzle_data['test']: if 'output' in test_case: all_outputs.append(test_case['output']) if all_outputs: solution = json.dumps(all_outputs) # Insert into arc_puzzles table # id, corpora, set, difficulty, comment sql_puzzles = """ INSERT INTO arc_puzzles (id, corpora, `set`, difficulty, comment) VALUES (%s, %s, %s, %s, %s) """ cursor.execute(sql_puzzles, (puzzle_id, 'MiniARC', None, None, comment)) # Insert into arc_jsons table sql_jsons = """ INSERT INTO arc_jsons (arc_puzzle_id, json, solution) VALUES (%s, %s, %s) """ cursor.execute(sql_jsons, (puzzle_id, json_content, solution)) return puzzle_id, comment def preview_data(files, limit=10): """Preview the parsed data without inserting into database""" print(f"\n{'=' * 80}") print(f"PREVIEW: First {limit} puzzles to be inserted") print(f"{'=' * 80}") for i, file_path in enumerate(files[:limit]): puzzle_id, comment = parse_filename(file_path.name) # Read and parse JSON with open(file_path, 'r') as f: puzzle_data = json.load(f) num_train = len(puzzle_data.get('train', [])) num_test = len(puzzle_data.get('test', [])) # Get grid sizes from first training example if puzzle_data.get('train'): first_train = puzzle_data['train'][0] input_shape = f"{len(first_train['input'])}x{len(first_train['input'][0])}" else: input_shape = "N/A" print(f"\n{i+1}. File: {file_path.name}") print(f" ID: {puzzle_id}") print(f" Comment: {comment}") print(f" Train examples: {num_train}, Test examples: {num_test}") print(f" Grid size (first train): {input_shape}") print(f"\n{'=' * 80}") print(f"Total files to insert: {len(files)}") print(f"{'=' * 80}") def main(): print("MiniARC Data Upload to MariaDB") print("=" * 50) # Check for preview mode preview_mode = '--preview' in sys.argv # Load configuration try: config = load_env_config() print(f"✓ Loaded configuration from .env") print(f" Host: {config['host']}") print(f" Database: {config['database']}") print(f" User: {config['user']}") except Exception as e: print(f"✗ Error loading configuration: {e}") return 1 # Get all MiniARC files all_files = get_miniarc_files() if not all_files: print(f"✗ No MiniARC JSON files found") return 1 print(f"✓ Found {len(all_files)} MiniARC JSON files") # Show preview preview_data(all_files) if preview_mode: print("\n[Preview mode - no data inserted]") return 0 # Connect to database try: print(f"\nConnecting to database...") connection = pymysql.connect(**config) print(f"✓ Connected successfully") except Exception as e: print(f"✗ Database connection failed: {e}") return 1 try: cursor = connection.cursor() # Check if tables exist cursor.execute("SHOW TABLES LIKE 'arc_puzzles'") if not cursor.fetchone(): print(f"✗ Table 'arc_puzzles' does not exist") return 1 cursor.execute("SHOW TABLES LIKE 'arc_jsons'") if not cursor.fetchone(): print(f"✗ Table 'arc_jsons' does not exist") return 1 # Check if 'comment' column exists in arc_puzzles cursor.execute("SHOW COLUMNS FROM arc_puzzles LIKE 'comment'") if not cursor.fetchone(): print(f"\n⚠ Column 'comment' does not exist in arc_puzzles table") print(f" Adding 'comment' column...") cursor.execute("ALTER TABLE arc_puzzles ADD COLUMN comment VARCHAR(255) DEFAULT NULL") connection.commit() print(f"✓ Column 'comment' added successfully") else: print(f"✓ Column 'comment' exists in arc_puzzles table") # Get current counts cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'MiniARC'") initial_puzzles_count = cursor.fetchone()[0] print(f"✓ Tables exist") print(f" Current MiniARC puzzles in arc_puzzles: {initial_puzzles_count}") # Ask for confirmation (unless --yes flag is provided) if '--yes' not in sys.argv: print(f"\n⚠ About to insert {len(all_files)} MiniARC records") response = input("Continue? (yes/no): ").strip().lower() if response not in ['yes', 'y']: print("Upload cancelled") return 0 else: print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)") print(f"\nInserting records...") inserted = 0 errors = 0 for i, file_path in enumerate(all_files, 1): try: puzzle_id, comment = insert_puzzle(cursor, file_path) inserted += 1 # Show progress every 20 records if i % 20 == 0 or i == len(all_files): print(f" Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)") except pymysql.IntegrityError as e: # Likely duplicate key if "Duplicate entry" in str(e): errors += 1 if errors <= 5: # Only show first 5 errors print(f" ⚠ Duplicate: {file_path.name}") else: raise except Exception as e: errors += 1 print(f" ✗ Error with {file_path.name}: {e}") if errors > 10: print(f" Too many errors, stopping...") break # Commit the transaction connection.commit() # Get final counts cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'MiniARC'") final_puzzles_count = cursor.fetchone()[0] print(f"\n{'=' * 50}") print(f"✓ Upload complete!") print(f" Successfully inserted: {inserted}") print(f" Errors/duplicates: {errors}") print(f" MiniARC puzzles: {initial_puzzles_count} → {final_puzzles_count} (+{final_puzzles_count - initial_puzzles_count})") except Exception as e: connection.rollback() print(f"\n✗ Error during upload: {e}") import traceback traceback.print_exc() return 1 finally: connection.close() print(f"\n✓ Database connection closed") return 0 if __name__ == '__main__': sys.exit(main())