Commit: initial

scripts/check_and_fix_column.py (new file, 54 lines)
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
import pymysql
from dotenv import load_dotenv
import os

load_dotenv()

config = {
    'host': os.getenv('DB_HOST'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': os.getenv('DB_NAME'),
    'port': int(os.getenv('DB_PORT', 3306)),
}

conn = pymysql.connect(**config)
cursor = conn.cursor()

# Check column type
cursor.execute("SHOW COLUMNS FROM arc_jsons WHERE Field = 'json'")
result = cursor.fetchone()
print(f"Current column type: {result[1]}")

# Check if we need to alter it
if 'text' in result[1].lower() and 'medium' not in result[1].lower() and 'long' not in result[1].lower():
    print("\n⚠ Column is TEXT (max 65,535 bytes)")
    print("  One file (4a21e3da.json) is ~69KB and failed to insert")
    print("\nRecommended fix: ALTER TABLE arc_jsons MODIFY json MEDIUMTEXT;")
    print("  MEDIUMTEXT supports up to 16MB")

    response = input("\nApply fix now? (yes/no): ").strip().lower()
    if response in ['yes', 'y']:
        print("\nAltering column to MEDIUMTEXT...")
        cursor.execute("ALTER TABLE arc_jsons MODIFY json MEDIUMTEXT")
        conn.commit()
        print("✓ Column altered successfully!")

        # Now insert the failed record
        print("\nRe-inserting failed record (4a21e3da)...")
        with open('arc_data/evaluation/4a21e3da.json', 'r') as f:
            json_content = f.read().strip()

        cursor.execute("INSERT INTO arc_jsons (id, json) VALUES (%s, %s)", ('4a21e3da', json_content))
        conn.commit()
        print("✓ Record inserted successfully!")

        # Final count
        cursor.execute("SELECT COUNT(*) FROM arc_jsons")
        count = cursor.fetchone()[0]
        print(f"\n✓ Total records in database: {count}")
else:
    print(f"✓ Column type is sufficient: {result[1]}")

conn.close()
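
Note: every script in this commit assumes the arc_jsons table already exists; its DDL is not part of the commit. A minimal sketch of the assumed schema, using the same .env-driven connection (the VARCHAR length is a guess; MEDIUMTEXT matches the fix this script applies):

import os
import pymysql
from dotenv import load_dotenv

load_dotenv()
conn = pymysql.connect(
    host=os.getenv('DB_HOST'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    database=os.getenv('DB_NAME'),
    port=int(os.getenv('DB_PORT', 3306)),
)
with conn.cursor() as cur:
    # id is the puzzle filename stem (e.g. '4a21e3da'); length 64 is an assumption.
    cur.execute("""
        CREATE TABLE IF NOT EXISTS arc_jsons (
            id   VARCHAR(64) PRIMARY KEY,
            json MEDIUMTEXT NOT NULL
        )
    """)
conn.commit()
conn.close()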

scripts/extract_solutions.py (new file, 222 lines)
@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
Extract test outputs from ARC puzzle JSON and store them in the solution column.
This script adds a 'solution' column and populates it with the test output grid.
"""

import json
import os
import sys

import pymysql
from dotenv import load_dotenv

# Force unbuffered output
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)


def load_env_config():
    """Load database configuration from .env file"""
    load_dotenv()

    config = {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': int(os.getenv('DB_PORT', 3306)),
        'charset': 'utf8mb4'
    }

    return config


def check_and_add_solution_column(cursor):
    """Check if solution column exists, add it if not"""
    cursor.execute("""
        SELECT COLUMN_NAME
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = DATABASE()
          AND TABLE_NAME = 'arc_jsons'
          AND COLUMN_NAME = 'solution'
    """)

    if cursor.fetchone():
        print("✓ Column 'solution' already exists")
        return True

    print("Adding 'solution' column to arc_jsons table...")
    cursor.execute("ALTER TABLE arc_jsons ADD COLUMN solution JSON AFTER json")
    print("✓ Column 'solution' added successfully")
    return True


def extract_test_output(json_content):
    """Extract test output from puzzle JSON"""
    try:
        # Parse JSON (handle both string and object)
        if isinstance(json_content, str):
            puzzle_data = json.loads(json_content)
        else:
            puzzle_data = json_content

        # Extract test output
        if puzzle_data.get('test') and len(puzzle_data['test']) > 0:
            test_case = puzzle_data['test'][0]
            if test_case.get('output'):
                return test_case['output']

        return None
    except Exception as e:
        print(f"Error parsing JSON: {e}")
        return None


def update_solution(cursor, puzzle_id, json_content):
    """Extract test output and update solution column"""
    test_output = extract_test_output(json_content)

    if test_output is None:
        return False, "No test output found"

    try:
        # Convert to JSON string
        solution_json = json.dumps(test_output)

        # Update database
        sql = "UPDATE arc_jsons SET solution = %s WHERE id = %s"
        cursor.execute(sql, (solution_json, puzzle_id))

        return True, None
    except Exception as e:
        return False, str(e)


def main():
    print("ARC Solution Extraction Tool")
    print("=" * 60)
    print("This script will:")
    print("  1. Add a 'solution' column to arc_jsons (if needed)")
    print("  2. Extract test outputs from JSON data")
    print("  3. Store solutions in the new column")
    print("=" * 60)

    # Load configuration
    try:
        config = load_env_config()
        print(f"\n✓ Loaded configuration from .env")
        print(f"  Host: {config['host']}")
        print(f"  Database: {config['database']}")
        print(f"  User: {config['user']}")
    except Exception as e:
        print(f"✗ Error loading configuration: {e}")
        return 1

    # Connect to database
    try:
        print(f"\nConnecting to database...")
        connection = pymysql.connect(**config)
        print(f"✓ Connected successfully")
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return 1

    try:
        cursor = connection.cursor()

        # Check if table exists
        cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
        if not cursor.fetchone():
            print(f"✗ Table 'arc_jsons' does not exist")
            return 1

        # Get current count
        cursor.execute("SELECT COUNT(*) FROM arc_jsons")
        total_count = cursor.fetchone()[0]
        print(f"✓ Table 'arc_jsons' found ({total_count} records)")

        # Check/add solution column
        print()
        check_and_add_solution_column(cursor)
        connection.commit()

        # Ask for confirmation (unless --yes flag is provided)
        if '--yes' not in sys.argv:
            print(f"\n⚠ About to process {total_count} records")
            response = input("Continue? (yes/no): ").strip().lower()
            if response not in ['yes', 'y']:
                print("Extraction cancelled")
                return 0
        else:
            print(f"\n⚠ About to process {total_count} records (auto-confirmed with --yes flag)")

        # Fetch all records
        print(f"\nFetching records...")
        cursor.execute("SELECT id, json FROM arc_jsons")
        records = cursor.fetchall()
        print(f"✓ Retrieved {len(records)} records")

        # Process each record
        print(f"\nProcessing records...")
        updated = 0
        errors = 0
        no_output = 0

        for i, (puzzle_id, json_content) in enumerate(records, 1):
            success, error = update_solution(cursor, puzzle_id, json_content)

            if success:
                updated += 1
            elif error == "No test output found":
                no_output += 1
                if no_output <= 5:  # Show first 5 cases
                    print(f"  ⚠ No output: {puzzle_id}")
            else:
                errors += 1
                if errors <= 5:  # Show first 5 errors
                    print(f"  ✗ Error {puzzle_id}: {error}")

            # Show progress every 100 records
            if i % 100 == 0 or i == len(records):
                print(f"  Progress: {i}/{len(records)} ({updated} updated, {no_output} no output, {errors} errors)")

        # Commit the transaction
        connection.commit()

        print(f"\n{'=' * 60}")
        print(f"✓ Extraction complete!")
        print(f"  Successfully updated: {updated}")
        print(f"  No test output: {no_output}")
        print(f"  Errors: {errors}")
        print(f"  Total processed: {len(records)}")

        # Show sample
        if updated > 0:
            print(f"\nSample record (first with solution):")
            cursor.execute("SELECT id, solution FROM arc_jsons WHERE solution IS NOT NULL LIMIT 1")
            sample = cursor.fetchone()
            if sample:
                sample_id, sample_solution = sample
                solution_data = json.loads(sample_solution)
                rows = len(solution_data)
                cols = len(solution_data[0]) if rows > 0 else 0
                print(f"  ID: {sample_id}")
                print(f"  Solution grid: {cols}×{rows}")
                print(f"  First row: {solution_data[0][:10]}..." if cols > 10 else f"  First row: {solution_data[0]}")

    except Exception as e:
        connection.rollback()
        print(f"\n✗ Error during extraction: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        connection.close()
        print(f"\n✓ Database connection closed")

    return 0


if __name__ == '__main__':
    sys.exit(main())
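
Note: extract_test_output() relies on the standard ARC task layout, where each JSON file holds 'train' and 'test' lists of input/output grid pairs (grids are 2-D lists of integers 0-9). An abridged sketch of that shape with tiny illustrative grids:

sample_task = {
    "train": [
        {"input": [[0, 1], [1, 0]], "output": [[1, 0], [0, 1]]},
    ],
    "test": [
        {"input": [[0, 0], [1, 1]], "output": [[1, 1], [0, 0]]},
    ],
}

# extract_test_output() returns the first test case's output grid:
assert extract_test_output(sample_task) == [[1, 1], [0, 0]]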

scripts/generate_sql.py (new file, 79 lines)
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Generate SQL INSERT statements for ARC puzzle data.
Reads all JSON files from arc_data/training and arc_data/evaluation
and creates INSERT statements for the arc_jsons table.
"""

import sys
from pathlib import Path


def escape_sql_string(s):
    """Escape single quotes and backslashes for a SQL string literal"""
    return s.replace("'", "''").replace("\\", "\\\\")


def generate_insert_statement(file_path):
    """Generate INSERT statement for a single JSON file"""
    # Extract ID from filename (without .json extension)
    puzzle_id = Path(file_path).stem

    # Read JSON content
    with open(file_path, 'r') as f:
        json_content = f.read().strip()

    # Escape the JSON string for SQL
    escaped_json = escape_sql_string(json_content)

    # Generate INSERT statement
    sql = f"INSERT INTO arc_jsons (id, json) VALUES ('{puzzle_id}', '{escaped_json}');"

    return sql


def main():
    base_dir = Path('arc_data')

    # Collect all JSON files
    training_files = sorted(base_dir.glob('training/*.json'))
    evaluation_files = sorted(base_dir.glob('evaluation/*.json'))

    all_files = training_files + evaluation_files

    print(f"-- Found {len(training_files)} training files")
    print(f"-- Found {len(evaluation_files)} evaluation files")
    print(f"-- Total: {len(all_files)} files")
    print()

    # Check if user wants preview mode
    preview_mode = '--preview' in sys.argv

    if preview_mode:
        print("-- PREVIEW MODE: Showing first 5 INSERT statements")
        print()
        files_to_show = all_files[:5]
    else:
        print("-- Generating all INSERT statements...")
        print()
        files_to_show = all_files

    # Generate INSERT statements
    for file_path in files_to_show:
        try:
            sql = generate_insert_statement(file_path)
            print(sql)
        except Exception as e:
            print(f"-- ERROR processing {file_path}: {e}", file=sys.stderr)

    if preview_mode:
        print()
        print(f"-- ... and {len(all_files) - 5} more")
        print()
        print("-- To generate all statements, run: python3 generate_sql.py > insert_arc_data.sql")
        print("-- To preview again: python3 generate_sql.py --preview")
    else:
        print()
        print(f"-- Successfully generated {len(all_files)} INSERT statements")


if __name__ == '__main__':
    main()
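
Note: as a worked example, for a hypothetical file arc_data/training/0a1b2c3d.json containing {"train": [], "test": []}, generate_insert_statement() would emit (double quotes need no escaping inside a single-quoted SQL literal):

INSERT INTO arc_jsons (id, json) VALUES ('0a1b2c3d', '{"train": [], "test": []}');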

scripts/generate_v1_csv.py (new file, 51 lines)
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""
Generate a CSV file with ARC V1 task IDs and their set (training/evaluation)
"""

import csv


def main():
    # Read the training IDs
    with open('arc_v1_training_ids.txt', 'r') as f:
        training_ids = [line.strip() for line in f if line.strip()]

    # Read the evaluation IDs
    with open('arc_v1_evaluation_ids.txt', 'r') as f:
        evaluation_ids = [line.strip() for line in f if line.strip()]

    # Create CSV file
    with open('arc_v1_task_ids.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Write header
        writer.writerow(['id', 'set'])

        # Write training tasks
        for task_id in training_ids:
            writer.writerow([task_id, 'training'])

        # Write evaluation tasks
        for task_id in evaluation_ids:
            writer.writerow([task_id, 'evaluation'])

    print("=" * 80)
    print("ARC V1 Task IDs - CSV Export")
    print("=" * 80)
    print(f"\nTraining Tasks: {len(training_ids)}")
    print(f"Evaluation Tasks: {len(evaluation_ids)}")
    print(f"Total: {len(training_ids) + len(evaluation_ids)}")

    print("\nFirst 10 rows (preview):")
    print("-" * 40)
    print("id,set")
    for task_id in training_ids[:5]:
        print(f"{task_id},training")
    for task_id in evaluation_ids[:5]:
        print(f"{task_id},evaluation")

    print("\n✓ Saved to: arc_v1_task_ids.csv")
    print("=" * 80)


if __name__ == '__main__':
    main()
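
Note: the two input files (arc_v1_training_ids.txt, arc_v1_evaluation_ids.txt) are expected to hold one task ID per line. A small sanity check of the generated CSV (a hypothetical helper, not part of this commit):

import csv

with open('arc_v1_task_ids.csv') as f:
    rows = list(csv.reader(f))

assert rows[0] == ['id', 'set']  # header row
assert all(row[1] in ('training', 'evaluation') for row in rows[1:])
print(f"{len(rows) - 1} task IDs verified")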

scripts/generate_v1_list.py (new file, 71 lines)
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
"""
Generate a comprehensive JSON file with official ARC V1 task IDs
"""

import json


def main():
    # Read the training IDs
    with open('arc_v1_training_ids.txt', 'r') as f:
        training_ids = [line.strip() for line in f if line.strip()]

    # Read the evaluation IDs
    with open('arc_v1_evaluation_ids.txt', 'r') as f:
        evaluation_ids = [line.strip() for line in f if line.strip()]

    # Create comprehensive JSON
    data = {
        "version": "ARC-AGI-1 (Official)",
        "source": "https://github.com/fchollet/ARC-AGI v1.0.2",
        "description": "Official ARC-AGI Version 1 task IDs - 400 training + 400 evaluation tasks",
        "total_tasks": len(training_ids) + len(evaluation_ids),
        "training": {
            "count": len(training_ids),
            "task_ids": training_ids
        },
        "evaluation": {
            "count": len(evaluation_ids),
            "task_ids": evaluation_ids
        },
        "all_task_ids": sorted(training_ids + evaluation_ids)
    }

    # Save to JSON
    with open('arc_v1_official_task_ids.json', 'w') as f:
        json.dump(data, f, indent=2)

    print("=" * 80)
    print("ARC V1 Official Task IDs")
    print("=" * 80)
    print(f"\nTraining Tasks: {len(training_ids)}")
    print(f"Evaluation Tasks: {len(evaluation_ids)}")
    print(f"Total: {len(training_ids) + len(evaluation_ids)}")

    print("\nFirst 10 Training IDs:")
    for task_id in training_ids[:10]:
        print(f"  {task_id}")

    print("\nFirst 10 Evaluation IDs:")
    for task_id in evaluation_ids[:10]:
        print(f"  {task_id}")

    print("\n✓ Saved to: arc_v1_official_task_ids.json")
    print("=" * 80)

    # Also create a simple combined text file
    with open('arc_v1_all_ids.txt', 'w') as f:
        f.write("# ARC-AGI Version 1 Official Task IDs\n")
        f.write("# Source: https://github.com/fchollet/ARC-AGI v1.0.2\n")
        f.write(f"# Total: {len(training_ids) + len(evaluation_ids)} tasks\n\n")
        f.write("## Training Tasks (400)\n")
        for task_id in training_ids:
            f.write(f"{task_id}\n")
        f.write("\n## Evaluation Tasks (400)\n")
        for task_id in evaluation_ids:
            f.write(f"{task_id}\n")

    print("✓ Saved to: arc_v1_all_ids.txt")


if __name__ == '__main__':
    main()
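
Note: a usage sketch for reading the generated file back; the expected counts follow from the 400 + 400 split stated in the description field:

import json

with open('arc_v1_official_task_ids.json') as f:
    v1 = json.load(f)

print(v1['training']['count'])    # expected: 400
print(v1['evaluation']['count'])  # expected: 400
print(len(v1['all_task_ids']))    # expected: 800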

scripts/generate_v2_csv.py (new file, 76 lines)
@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Generate a CSV file with ARC V2 task IDs, set, and difficulty
"""

import csv
import json


def main():
    # Read the training IDs
    with open('arc_v2_training_ids.txt', 'r') as f:
        training_ids = [line.strip() for line in f if line.strip()]

    # Read the evaluation IDs
    with open('arc_v2_evaluation_ids.txt', 'r') as f:
        evaluation_ids = [line.strip() for line in f if line.strip()]

    # Create CSV file
    with open('arc_v2_task_ids.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Write header
        writer.writerow(['id', 'set', 'difficulty'])

        # Write training tasks (training = easy per official labeling)
        for task_id in training_ids:
            writer.writerow([task_id, 'training', 'easy'])

        # Write evaluation tasks (evaluation = hard per official labeling)
        for task_id in evaluation_ids:
            writer.writerow([task_id, 'evaluation', 'hard'])

    print("=" * 80)
    print("ARC-AGI-2 Task IDs - CSV Export")
    print("=" * 80)
    print(f"\nTraining Tasks: {len(training_ids)} (labeled: easy)")
    print(f"Evaluation Tasks: {len(evaluation_ids)} (labeled: hard)")
    print(f"Total: {len(training_ids) + len(evaluation_ids)}")

    print("\nFirst 10 rows (preview):")
    print("-" * 50)
    print("id,set,difficulty")
    for task_id in training_ids[:5]:
        print(f"{task_id},training,easy")
    for task_id in evaluation_ids[:5]:
        print(f"{task_id},evaluation,hard")

    print("\n✓ Saved to: arc_v2_task_ids.csv")

    # Also create a JSON version
    data = {
        "version": "ARC-AGI-2",
        "source": "https://github.com/arcprize/ARC-AGI-2",
        "description": "ARC-AGI Version 2 task IDs - 1000 training + 120 evaluation tasks",
        "total_tasks": len(training_ids) + len(evaluation_ids),
        "training": {
            "count": len(training_ids),
            "difficulty": "easy",
            "task_ids": training_ids
        },
        "evaluation": {
            "count": len(evaluation_ids),
            "difficulty": "hard",
            "task_ids": evaluation_ids
        },
        "all_task_ids": sorted(training_ids + evaluation_ids)
    }

    with open('arc_v2_official_task_ids.json', 'w') as f:
        json.dump(data, f, indent=2)

    print("✓ Saved to: arc_v2_official_task_ids.json")
    print("=" * 80)


if __name__ == '__main__':
    main()

scripts/list_v1_task_ids.py (new file, 77 lines)
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Generate a list of all ARC V1 task IDs from the local dataset
"""

import os
import json


def get_task_ids_from_directory(directory):
    """Get all task IDs (filenames without .json) from a directory"""
    task_ids = []
    if os.path.exists(directory):
        for filename in sorted(os.listdir(directory)):
            if filename.endswith('.json'):
                task_id = filename[:-5]  # Remove .json extension
                task_ids.append(task_id)
    return task_ids


def main():
    # Get task IDs from training and evaluation directories
    training_dir = 'arc_data/training'
    evaluation_dir = 'arc_data/evaluation'

    training_ids = get_task_ids_from_directory(training_dir)
    evaluation_ids = get_task_ids_from_directory(evaluation_dir)

    print("ARC V1 Task IDs")
    print("=" * 80)
    print(f"\nTraining Tasks: {len(training_ids)} tasks")
    print(f"Evaluation Tasks: {len(evaluation_ids)} tasks")
    print(f"Total: {len(training_ids) + len(evaluation_ids)} tasks")

    # Save to JSON file
    output_data = {
        "version": "ARC-AGI-1",
        "total_tasks": len(training_ids) + len(evaluation_ids),
        "training": {
            "count": len(training_ids),
            "task_ids": training_ids
        },
        "evaluation": {
            "count": len(evaluation_ids),
            "task_ids": evaluation_ids
        },
        "all_task_ids": sorted(training_ids + evaluation_ids)
    }

    with open('arc_v1_task_ids.json', 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\n✓ Saved complete list to: arc_v1_task_ids.json")

    # Also save a simple text list
    with open('arc_v1_task_ids.txt', 'w') as f:
        f.write("# ARC V1 Training Task IDs\n")
        for task_id in training_ids:
            f.write(f"{task_id}\n")
        f.write("\n# ARC V1 Evaluation Task IDs\n")
        for task_id in evaluation_ids:
            f.write(f"{task_id}\n")

    print(f"✓ Saved text list to: arc_v1_task_ids.txt")

    # Display first 10 from each set as preview
    print("\n" + "-" * 80)
    print("Preview - First 10 Training Task IDs:")
    for task_id in training_ids[:10]:
        print(f"  {task_id}")

    print("\nPreview - First 10 Evaluation Task IDs:")
    for task_id in evaluation_ids[:10]:
        print(f"  {task_id}")

    print("\n" + "=" * 80)


if __name__ == '__main__':
    main()

scripts/upload_to_db.py (new executable file, 161 lines)
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""
Upload ARC puzzle data to MariaDB database.
Reads credentials from .env file and inserts all JSON files.
"""

import os
import sys
from pathlib import Path

import pymysql
from dotenv import load_dotenv

# Force unbuffered output
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)


def load_env_config():
    """Load database configuration from .env file"""
    load_dotenv()

    config = {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': int(os.getenv('DB_PORT', 3306)),
        'charset': 'utf8mb4'
    }

    return config


def get_all_json_files():
    """Get all JSON files from training and evaluation directories"""
    base_dir = Path('arc_data')

    training_files = sorted(base_dir.glob('training/*.json'))
    evaluation_files = sorted(base_dir.glob('evaluation/*.json'))

    return training_files + evaluation_files


def insert_puzzle(cursor, file_path):
    """Insert a single puzzle into the database"""
    # Extract ID from filename
    puzzle_id = file_path.stem

    # Read JSON content
    with open(file_path, 'r') as f:
        json_content = f.read().strip()

    # Insert into database using parameterized query (prevents SQL injection)
    sql = "INSERT INTO arc_jsons (id, json) VALUES (%s, %s)"
    cursor.execute(sql, (puzzle_id, json_content))

    return puzzle_id


def main():
    print("ARC Data Upload to MariaDB")
    print("=" * 50)

    # Load configuration
    try:
        config = load_env_config()
        print(f"✓ Loaded configuration from .env")
        print(f"  Host: {config['host']}")
        print(f"  Database: {config['database']}")
        print(f"  User: {config['user']}")
    except Exception as e:
        print(f"✗ Error loading configuration: {e}")
        return 1

    # Get all files
    all_files = get_all_json_files()
    print(f"✓ Found {len(all_files)} JSON files")

    # Connect to database
    try:
        print(f"\nConnecting to database...")
        connection = pymysql.connect(**config)
        print(f"✓ Connected successfully")
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return 1

    try:
        cursor = connection.cursor()

        # Check if table exists
        cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
        if not cursor.fetchone():
            print(f"✗ Table 'arc_jsons' does not exist")
            return 1

        # Get current count
        cursor.execute("SELECT COUNT(*) FROM arc_jsons")
        initial_count = cursor.fetchone()[0]
        print(f"✓ Table 'arc_jsons' exists (current rows: {initial_count})")

        # Ask for confirmation (unless --yes flag is provided)
        if '--yes' not in sys.argv:
            print(f"\n⚠ About to insert {len(all_files)} records")
            response = input("Continue? (yes/no): ").strip().lower()
            if response not in ['yes', 'y']:
                print("Upload cancelled")
                return 0
        else:
            print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)")

        print(f"\nInserting records...")
        inserted = 0
        errors = 0

        for i, file_path in enumerate(all_files, 1):
            try:
                puzzle_id = insert_puzzle(cursor, file_path)
                inserted += 1

                # Show progress every 100 records
                if i % 100 == 0 or i == len(all_files):
                    print(f"  Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)")

            except pymysql.IntegrityError as e:
                # Likely duplicate key
                if "Duplicate entry" in str(e):
                    errors += 1
                    if errors <= 5:  # Only show first 5 errors
                        print(f"  ⚠ Duplicate: {file_path.stem}")
                else:
                    raise
            except Exception as e:
                errors += 1
                print(f"  ✗ Error with {file_path.stem}: {e}")
                if errors > 10:
                    print(f"  Too many errors, stopping...")
                    break

        # Commit the transaction
        connection.commit()

        # Get final count
        cursor.execute("SELECT COUNT(*) FROM arc_jsons")
        final_count = cursor.fetchone()[0]

        print(f"\n{'=' * 50}")
        print(f"✓ Upload complete!")
        print(f"  Successfully inserted: {inserted}")
        print(f"  Errors/duplicates: {errors}")
        print(f"  Database rows: {initial_count} → {final_count} (+{final_count - initial_count})")

    except Exception as e:
        connection.rollback()
        print(f"\n✗ Error during upload: {e}")
        return 1
    finally:
        connection.close()
        print(f"\n✓ Database connection closed")

    return 0


if __name__ == '__main__':
    sys.exit(main())
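
Note: the script above skips duplicate keys and counts them as errors. If re-runs should overwrite the stored JSON instead, an upsert variant of insert_puzzle's query could look like this (a sketch, not what this commit does; ON DUPLICATE KEY UPDATE is supported by MariaDB/MySQL):

sql = """
    INSERT INTO arc_jsons (id, json) VALUES (%s, %s)
    ON DUPLICATE KEY UPDATE json = VALUES(json)
"""
cursor.execute(sql, (puzzle_id, json_content))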

scripts/verify_solutions.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Verify that solutions were extracted successfully.
Shows sample solutions from the database.
"""

import json
import os

import pymysql
from dotenv import load_dotenv


def load_env_config():
    """Load database configuration from .env file"""
    load_dotenv()

    config = {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': int(os.getenv('DB_PORT', 3306)),
        'charset': 'utf8mb4'
    }

    return config


def main():
    print("Solution Verification Tool")
    print("=" * 60)

    # Load configuration and connect
    config = load_env_config()
    connection = pymysql.connect(**config)
    cursor = connection.cursor()

    # Get statistics
    cursor.execute("SELECT COUNT(*) FROM arc_jsons")
    total_records = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE solution IS NOT NULL")
    with_solutions = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE solution IS NULL")
    without_solutions = cursor.fetchone()[0]

    print(f"\nDatabase Statistics:")
    print(f"  Total records: {total_records}")
    print(f"  With solutions: {with_solutions}")
    print(f"  Without solutions: {without_solutions}")

    # Show 5 sample solutions
    print(f"\nSample Solutions:")
    print("-" * 60)

    cursor.execute("SELECT id, solution FROM arc_jsons WHERE solution IS NOT NULL LIMIT 5")
    samples = cursor.fetchall()

    for i, (puzzle_id, solution_json) in enumerate(samples, 1):
        solution = json.loads(solution_json)
        rows = len(solution)
        cols = len(solution[0]) if rows > 0 else 0

        print(f"\n{i}. Puzzle ID: {puzzle_id}")
        print(f"   Grid size: {cols}×{rows}")
        print(f"   Grid data:")

        # Show grid visually
        for row in solution[:5]:  # Show up to 5 rows
            print(f"   {row}")

        if rows > 5:
            print(f"   ... ({rows - 5} more rows)")

    connection.close()
    print(f"\n{'=' * 60}")
    print("Verification complete!")


if __name__ == '__main__':
    main()
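
Note: sample grids print as nested Python lists. A small rendering helper (hypothetical, not part of this commit) makes them easier to scan:

def render_grid(grid):
    """Render a grid as space-separated digits, one row per line."""
    return '\n'.join(' '.join(str(cell) for cell in row) for row in grid)

print(render_grid([[0, 1, 2], [3, 4, 5]]))
# 0 1 2
# 3 4 5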