289 lines
9.2 KiB
Python
289 lines
9.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Upload MiniARC puzzle data to MariaDB database
|
|
Reads credentials from .env file and inserts all MiniARC JSON files
|
|
|
|
File naming convention: {comment}_{id}.json
|
|
- id: last part after underscore (e.g., l6aejqqqc1b47pjr5g4)
|
|
- comment: everything before the id (e.g., "1_3_5")
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import re
|
|
from pathlib import Path
|
|
import pymysql
|
|
from dotenv import load_dotenv
|
|
|
|
# Force unbuffered output
|
|
sys.stdout.reconfigure(line_buffering=True)
|
|
sys.stderr.reconfigure(line_buffering=True)
|
|
|
|
def load_env_config():
|
|
"""Load database configuration from .env file"""
|
|
load_dotenv()
|
|
|
|
config = {
|
|
'host': os.getenv('DB_HOST'),
|
|
'user': os.getenv('DB_USER'),
|
|
'password': os.getenv('DB_PASSWORD'),
|
|
'database': os.getenv('DB_NAME'),
|
|
'port': int(os.getenv('DB_PORT', 3306)),
|
|
'charset': 'utf8mb4'
|
|
}
|
|
|
|
return config
|
|
|
|
def parse_filename(filename):
|
|
"""
|
|
Parse MiniARC filename to extract id and comment.
|
|
|
|
Filename format: {comment}_{id}.json
|
|
The id is the last part starting with 'l6' or similar pattern.
|
|
|
|
Examples:
|
|
- "1_3_5_l6aejqqqc1b47pjr5g4.json" -> id="l6aejqqqc1b47pjr5g4", comment="1_3_5"
|
|
- "Good_axis_l69ctqaoulgvm1zso2.json" -> id="l69ctqaoulgvm1zso2", comment="Good_axis"
|
|
"""
|
|
stem = filename.replace('.json', '')
|
|
|
|
# The ID pattern appears to be: l6[a-z0-9]+ or l69[a-z0-9]+
|
|
# Find the last underscore followed by 'l6' pattern
|
|
match = re.search(r'^(.+)_(l6[a-z0-9]+)$', stem)
|
|
|
|
if match:
|
|
comment = match.group(1)
|
|
puzzle_id = match.group(2)
|
|
return puzzle_id, comment
|
|
else:
|
|
# Fallback: use the whole stem as id
|
|
return stem, None
|
|
|
|
def get_miniarc_files():
|
|
"""Get all JSON files from MiniARC directory"""
|
|
base_dir = Path('data/MiniARC')
|
|
|
|
if not base_dir.exists():
|
|
print(f"✗ MiniARC directory not found: {base_dir}")
|
|
return []
|
|
|
|
json_files = sorted(base_dir.glob('*.json'))
|
|
return json_files
|
|
|
|
def insert_puzzle(cursor, file_path):
|
|
"""Insert a single MiniARC puzzle into the database"""
|
|
# Parse filename to extract puzzle ID and comment
|
|
puzzle_id, comment = parse_filename(file_path.name)
|
|
|
|
# Read JSON content
|
|
with open(file_path, 'r') as f:
|
|
json_content = f.read().strip()
|
|
|
|
# Verify JSON is valid and extract solution
|
|
try:
|
|
puzzle_data = json.loads(json_content)
|
|
except json.JSONDecodeError as e:
|
|
raise ValueError(f"Invalid JSON in {file_path}: {e}")
|
|
|
|
# Extract all solutions from test cases
|
|
solution = None
|
|
if 'test' in puzzle_data and len(puzzle_data['test']) > 0:
|
|
all_outputs = []
|
|
for test_case in puzzle_data['test']:
|
|
if 'output' in test_case:
|
|
all_outputs.append(test_case['output'])
|
|
|
|
if all_outputs:
|
|
solution = json.dumps(all_outputs)
|
|
|
|
# Insert into arc_puzzles table
|
|
# id, corpora, set, difficulty, comment
|
|
sql_puzzles = """
|
|
INSERT INTO arc_puzzles (id, corpora, `set`, difficulty, comment)
|
|
VALUES (%s, %s, %s, %s, %s)
|
|
"""
|
|
cursor.execute(sql_puzzles, (puzzle_id, 'MiniARC', None, None, comment))
|
|
|
|
# Insert into arc_jsons table
|
|
sql_jsons = """
|
|
INSERT INTO arc_jsons (arc_puzzle_id, json, solution)
|
|
VALUES (%s, %s, %s)
|
|
"""
|
|
cursor.execute(sql_jsons, (puzzle_id, json_content, solution))
|
|
|
|
return puzzle_id, comment
|
|
|
|
def preview_data(files, limit=10):
|
|
"""Preview the parsed data without inserting into database"""
|
|
print(f"\n{'=' * 80}")
|
|
print(f"PREVIEW: First {limit} puzzles to be inserted")
|
|
print(f"{'=' * 80}")
|
|
|
|
for i, file_path in enumerate(files[:limit]):
|
|
puzzle_id, comment = parse_filename(file_path.name)
|
|
|
|
# Read and parse JSON
|
|
with open(file_path, 'r') as f:
|
|
puzzle_data = json.load(f)
|
|
|
|
num_train = len(puzzle_data.get('train', []))
|
|
num_test = len(puzzle_data.get('test', []))
|
|
|
|
# Get grid sizes from first training example
|
|
if puzzle_data.get('train'):
|
|
first_train = puzzle_data['train'][0]
|
|
input_shape = f"{len(first_train['input'])}x{len(first_train['input'][0])}"
|
|
else:
|
|
input_shape = "N/A"
|
|
|
|
print(f"\n{i+1}. File: {file_path.name}")
|
|
print(f" ID: {puzzle_id}")
|
|
print(f" Comment: {comment}")
|
|
print(f" Train examples: {num_train}, Test examples: {num_test}")
|
|
print(f" Grid size (first train): {input_shape}")
|
|
|
|
print(f"\n{'=' * 80}")
|
|
print(f"Total files to insert: {len(files)}")
|
|
print(f"{'=' * 80}")
|
|
|
|
def main():
|
|
print("MiniARC Data Upload to MariaDB")
|
|
print("=" * 50)
|
|
|
|
# Check for preview mode
|
|
preview_mode = '--preview' in sys.argv
|
|
|
|
# Load configuration
|
|
try:
|
|
config = load_env_config()
|
|
print(f"✓ Loaded configuration from .env")
|
|
print(f" Host: {config['host']}")
|
|
print(f" Database: {config['database']}")
|
|
print(f" User: {config['user']}")
|
|
except Exception as e:
|
|
print(f"✗ Error loading configuration: {e}")
|
|
return 1
|
|
|
|
# Get all MiniARC files
|
|
all_files = get_miniarc_files()
|
|
if not all_files:
|
|
print(f"✗ No MiniARC JSON files found")
|
|
return 1
|
|
|
|
print(f"✓ Found {len(all_files)} MiniARC JSON files")
|
|
|
|
# Show preview
|
|
preview_data(all_files)
|
|
|
|
if preview_mode:
|
|
print("\n[Preview mode - no data inserted]")
|
|
return 0
|
|
|
|
# Connect to database
|
|
try:
|
|
print(f"\nConnecting to database...")
|
|
connection = pymysql.connect(**config)
|
|
print(f"✓ Connected successfully")
|
|
except Exception as e:
|
|
print(f"✗ Database connection failed: {e}")
|
|
return 1
|
|
|
|
try:
|
|
cursor = connection.cursor()
|
|
|
|
# Check if tables exist
|
|
cursor.execute("SHOW TABLES LIKE 'arc_puzzles'")
|
|
if not cursor.fetchone():
|
|
print(f"✗ Table 'arc_puzzles' does not exist")
|
|
return 1
|
|
|
|
cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
|
|
if not cursor.fetchone():
|
|
print(f"✗ Table 'arc_jsons' does not exist")
|
|
return 1
|
|
|
|
# Check if 'comment' column exists in arc_puzzles
|
|
cursor.execute("SHOW COLUMNS FROM arc_puzzles LIKE 'comment'")
|
|
if not cursor.fetchone():
|
|
print(f"\n⚠ Column 'comment' does not exist in arc_puzzles table")
|
|
print(f" Adding 'comment' column...")
|
|
cursor.execute("ALTER TABLE arc_puzzles ADD COLUMN comment VARCHAR(255) DEFAULT NULL")
|
|
connection.commit()
|
|
print(f"✓ Column 'comment' added successfully")
|
|
else:
|
|
print(f"✓ Column 'comment' exists in arc_puzzles table")
|
|
|
|
# Get current counts
|
|
cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'MiniARC'")
|
|
initial_puzzles_count = cursor.fetchone()[0]
|
|
|
|
print(f"✓ Tables exist")
|
|
print(f" Current MiniARC puzzles in arc_puzzles: {initial_puzzles_count}")
|
|
|
|
# Ask for confirmation (unless --yes flag is provided)
|
|
if '--yes' not in sys.argv:
|
|
print(f"\n⚠ About to insert {len(all_files)} MiniARC records")
|
|
response = input("Continue? (yes/no): ").strip().lower()
|
|
if response not in ['yes', 'y']:
|
|
print("Upload cancelled")
|
|
return 0
|
|
else:
|
|
print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)")
|
|
|
|
print(f"\nInserting records...")
|
|
inserted = 0
|
|
errors = 0
|
|
|
|
for i, file_path in enumerate(all_files, 1):
|
|
try:
|
|
puzzle_id, comment = insert_puzzle(cursor, file_path)
|
|
inserted += 1
|
|
|
|
# Show progress every 20 records
|
|
if i % 20 == 0 or i == len(all_files):
|
|
print(f" Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)")
|
|
|
|
except pymysql.IntegrityError as e:
|
|
# Likely duplicate key
|
|
if "Duplicate entry" in str(e):
|
|
errors += 1
|
|
if errors <= 5: # Only show first 5 errors
|
|
print(f" ⚠ Duplicate: {file_path.name}")
|
|
else:
|
|
raise
|
|
except Exception as e:
|
|
errors += 1
|
|
print(f" ✗ Error with {file_path.name}: {e}")
|
|
if errors > 10:
|
|
print(f" Too many errors, stopping...")
|
|
break
|
|
|
|
# Commit the transaction
|
|
connection.commit()
|
|
|
|
# Get final counts
|
|
cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'MiniARC'")
|
|
final_puzzles_count = cursor.fetchone()[0]
|
|
|
|
print(f"\n{'=' * 50}")
|
|
print(f"✓ Upload complete!")
|
|
print(f" Successfully inserted: {inserted}")
|
|
print(f" Errors/duplicates: {errors}")
|
|
print(f" MiniARC puzzles: {initial_puzzles_count} → {final_puzzles_count} (+{final_puzzles_count - initial_puzzles_count})")
|
|
|
|
except Exception as e:
|
|
connection.rollback()
|
|
print(f"\n✗ Error during upload: {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
return 1
|
|
finally:
|
|
connection.close()
|
|
print(f"\n✓ Database connection closed")
|
|
|
|
return 0
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(main())
|