From fa5f12fd47341479c286aae57fb62cdd741a4f47 Mon Sep 17 00:00:00 2001
From: bmachado
Date: Wed, 5 Nov 2025 13:28:23 +0000
Subject: [PATCH] ConceptARC db upload

---
Review note: an earlier revision of this patch added `.env` containing a real
database host, user and password. The template is now shipped as
`.env.example` with placeholder values. Rotate the exposed password and keep
the real `.env` untracked. The `index` line of the hunk below was dropped
because its blob hash no longer matches the edited content.

 .env.example                        |  10 ++
 scripts/check_arc_puzzles.py        |  83 ++++++++++
 scripts/check_schema.py             | 117 ++++++++++++++
 scripts/find_duplicates.py          |  82 ++++++++++
 scripts/fix_conceptarc_solutions.py | 155 ++++++++++++++++++
 scripts/remove_duplicates.py        | 146 +++++++++++++++++
 scripts/upload_conceptarc_to_db.py  | 240 ++++++++++++++++++++++++++++
 scripts/verify_conceptarc_upload.py |  86 ++++++++++
 scripts/verify_solution_format.py   | 131 +++++++++++++++
 todo.md                             |  12 +-
 10 files changed, 1061 insertions(+), 1 deletion(-)
 create mode 100644 .env.example
 create mode 100644 scripts/check_arc_puzzles.py
 create mode 100755 scripts/check_schema.py
 create mode 100644 scripts/find_duplicates.py
 create mode 100755 scripts/fix_conceptarc_solutions.py
 create mode 100755 scripts/remove_duplicates.py
 create mode 100755 scripts/upload_conceptarc_to_db.py
 create mode 100755 scripts/verify_conceptarc_upload.py
 create mode 100644 scripts/verify_solution_format.py

diff --git a/.env.example b/.env.example
new file mode 100644
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,10 @@
+# Copy this file to .env and fill in the values
+# Database configuration
+DB_HOST=localhost
+DB_USER=your_db_user
+DB_PASSWORD=change_me
+DB_NAME=arc
+DB_PORT=3306
+
+# Add other secrets here as needed
+# e.g., API keys: OPENAI_API_KEY=...
diff --git a/scripts/check_arc_puzzles.py b/scripts/check_arc_puzzles.py new file mode 100644 index 0000000..f097478 --- /dev/null +++ b/scripts/check_arc_puzzles.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +""" +Check arc_puzzles table for ConceptArc entries +""" + +import os +import pymysql +from dotenv import load_dotenv + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + return { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + +def main(): + print("Checking arc_puzzles table") + print("=" * 50) + + config = load_env_config() + connection = pymysql.connect(**config) + cursor = connection.cursor() + + # Check total counts by corpora + cursor.execute(""" + SELECT corpora, COUNT(*) as count + FROM arc_puzzles + GROUP BY corpora + ORDER BY corpora + """) + + print("\nTotal puzzles by corpora:") + for corpora, count in cursor.fetchall(): + print(f" {corpora}: {count}") + + # Check ConceptArc entries in detail + cursor.execute(""" + SELECT id, corpora, `set`, difficulty, version + FROM arc_puzzles + WHERE corpora = 'ConceptArc' + ORDER BY `set`, id + LIMIT 20 + """) + + print("\n" + "=" * 50) + print("First 20 ConceptArc entries in arc_puzzles:") + print("=" * 50) + print(f"{'ID':<20} {'Corpora':<15} {'Set':<20} {'Difficulty':<12} {'Version':<10}") + print("-" * 80) + + rows = cursor.fetchall() + if rows: + for row in rows: + puzzle_id, corpora, set_name, difficulty, version = row + print(f"{puzzle_id:<20} {corpora:<15} {set_name:<20} {str(difficulty):<12} {str(version):<10}") + else: + print("No ConceptArc entries found!") + + # Count by set + cursor.execute(""" + SELECT `set`, COUNT(*) as count + FROM arc_puzzles + WHERE corpora = 'ConceptArc' + GROUP BY `set` + ORDER BY `set` + """) + + print("\n" + "=" * 50) + print("ConceptArc puzzles by category:") + 
print("=" * 50) + for set_name, count in cursor.fetchall(): + print(f" {set_name}: {count}") + + connection.close() + +if __name__ == '__main__': + main() diff --git a/scripts/check_schema.py b/scripts/check_schema.py new file mode 100755 index 0000000..bd4445c --- /dev/null +++ b/scripts/check_schema.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +""" +Check the database schema for arc_puzzles table +""" + +import os +import sys +import pymysql +from dotenv import load_dotenv + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + + config = { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + + return config + +def main(): + print("Checking Database Schema") + print("=" * 50) + + # Load configuration + try: + config = load_env_config() + print(f"✓ Loaded configuration from .env") + print(f" Host: {config['host']}") + print(f" Database: {config['database']}") + print(f" User: {config['user']}") + except Exception as e: + print(f"✗ Error loading configuration: {e}") + return 1 + + # Connect to database + try: + print(f"\nConnecting to database...") + connection = pymysql.connect(**config) + print(f"✓ Connected successfully") + except Exception as e: + print(f"✗ Database connection failed: {e}") + return 1 + + try: + cursor = connection.cursor() + + # Show all tables + print("\n" + "=" * 50) + print("TABLES IN DATABASE:") + print("=" * 50) + cursor.execute("SHOW TABLES") + tables = cursor.fetchall() + for table in tables: + print(f" - {table[0]}") + + # Check for arc_puzzles table + cursor.execute("SHOW TABLES LIKE 'arc_puzzles'") + if cursor.fetchone(): + print("\n" + "=" * 50) + print("SCHEMA FOR 'arc_puzzles' TABLE:") + print("=" * 50) + cursor.execute("DESCRIBE arc_puzzles") + columns = cursor.fetchall() + for col in columns: + print(f" {col[0]:<20} {col[1]:<20} 
Null:{col[2]} Key:{col[3]} Default:{col[4]}") + + # Get sample data + cursor.execute("SELECT * FROM arc_puzzles LIMIT 3") + print("\n" + "=" * 50) + print("SAMPLE DATA (first 3 rows):") + print("=" * 50) + rows = cursor.fetchall() + if rows: + # Get column names + cursor.execute("DESCRIBE arc_puzzles") + columns = [col[0] for col in cursor.fetchall()] + print(" Columns:", ", ".join(columns)) + for i, row in enumerate(rows, 1): + print(f"\n Row {i}:") + for col_name, value in zip(columns, row): + if col_name == 'json': + print(f" {col_name}: [JSON data, length={len(str(value))}]") + else: + print(f" {col_name}: {value}") + else: + print(" (No data in table)") + else: + print("\n✗ Table 'arc_puzzles' does not exist") + + # Check for arc_jsons table + cursor.execute("SHOW TABLES LIKE 'arc_jsons'") + if cursor.fetchone(): + print("\n" + "=" * 50) + print("SCHEMA FOR 'arc_jsons' TABLE:") + print("=" * 50) + cursor.execute("DESCRIBE arc_jsons") + columns = cursor.fetchall() + for col in columns: + print(f" {col[0]:<20} {col[1]:<20} Null:{col[2]} Key:{col[3]} Default:{col[4]}") + + except Exception as e: + print(f"\n✗ Error querying database: {e}") + return 1 + finally: + connection.close() + print(f"\n✓ Database connection closed") + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/scripts/find_duplicates.py b/scripts/find_duplicates.py new file mode 100644 index 0000000..d484957 --- /dev/null +++ b/scripts/find_duplicates.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +""" +Find duplicate puzzle IDs in arc_puzzles table +""" + +import os +import pymysql +from dotenv import load_dotenv + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + return { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + +def main(): + print("Finding Duplicate IDs in 
arc_puzzles") + print("=" * 50) + + config = load_env_config() + connection = pymysql.connect(**config) + cursor = connection.cursor() + + # Find all duplicate IDs + cursor.execute(""" + SELECT id, corpora, `set`, COUNT(*) as count + FROM arc_puzzles + GROUP BY id, corpora, `set` + HAVING COUNT(*) > 1 + ORDER BY count DESC, id + """) + + duplicates = cursor.fetchall() + + if duplicates: + print(f"\n⚠ Found {len(duplicates)} duplicate entries:") + print(f"{'ID':<20} {'Corpora':<15} {'Set':<20} {'Count':<10}") + print("-" * 70) + for puzzle_id, corpora, set_name, count in duplicates: + print(f"{puzzle_id:<20} {corpora:<15} {set_name:<20} {count:<10}") + + # Calculate totals + total_duplicates = sum(count - 1 for _, _, _, count in duplicates) + print(f"\nTotal duplicate rows to remove: {total_duplicates}") + + # Check ConceptArc specifically + conceptarc_dups = [d for d in duplicates if d[1] == 'ConceptArc'] + if conceptarc_dups: + print(f"\nConceptArc duplicates: {len(conceptarc_dups)} unique IDs") + conceptarc_dup_count = sum(count - 1 for _, _, _, count in conceptarc_dups) + print(f"ConceptArc duplicate rows to remove: {conceptarc_dup_count}") + else: + print("\n✓ No duplicates found!") + + # Show total counts + print("\n" + "=" * 50) + print("Current table statistics:") + print("=" * 50) + cursor.execute("SELECT COUNT(*) FROM arc_puzzles") + total = cursor.fetchone()[0] + print(f"Total rows in arc_puzzles: {total}") + + cursor.execute(""" + SELECT corpora, COUNT(*) as count + FROM arc_puzzles + GROUP BY corpora + ORDER BY corpora + """) + print("\nBy corpora:") + for corpora, count in cursor.fetchall(): + print(f" {corpora}: {count}") + + connection.close() + +if __name__ == '__main__': + main() diff --git a/scripts/fix_conceptarc_solutions.py b/scripts/fix_conceptarc_solutions.py new file mode 100755 index 0000000..bd3b17b --- /dev/null +++ b/scripts/fix_conceptarc_solutions.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +""" +Fix ConceptArc solutions in the 
database to include ALL test outputs, not just the last one +""" + +import json +import os +import sys +from pathlib import Path +import pymysql +from dotenv import load_dotenv + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + return { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + +def main(): + print("Fixing ConceptArc Solutions") + print("=" * 50) + + config = load_env_config() + connection = pymysql.connect(**config) + cursor = connection.cursor() + + try: + # Get all ConceptArc entries + cursor.execute(""" + SELECT aj.id, aj.arc_puzzle_id, aj.json, aj.solution + FROM arc_jsons aj + JOIN arc_puzzles ap ON aj.arc_puzzle_id = ap.id + WHERE ap.corpora = 'ConceptArc' + ORDER BY aj.arc_puzzle_id + """) + + entries = cursor.fetchall() + print(f"Found {len(entries)} ConceptArc entries to check") + + if not entries: + print("No ConceptArc entries found!") + return 0 + + # Ask for confirmation + if '--yes' not in sys.argv: + response = input(f"\nUpdate solutions for {len(entries)} entries? 
(yes/no): ").strip().lower() + if response not in ['yes', 'y']: + print("Operation cancelled") + return 0 + else: + print(f"Updating solutions (auto-confirmed with --yes flag)") + + updated = 0 + errors = 0 + skipped = 0 + + print("\nProcessing entries...") + + for row_id, puzzle_id, json_str, current_solution in entries: + try: + # Parse the puzzle JSON + puzzle_data = json.loads(json_str) + + # Extract all test outputs + all_outputs = [] + if 'test' in puzzle_data: + for test_case in puzzle_data['test']: + if 'output' in test_case: + all_outputs.append(test_case['output']) + + if not all_outputs: + print(f" ⚠ {puzzle_id}: No test outputs found") + skipped += 1 + continue + + # Create new solution as array of all outputs + new_solution = json.dumps(all_outputs) + + # Check if it's different from current + if current_solution == new_solution: + skipped += 1 + continue + + # Update the solution + cursor.execute(""" + UPDATE arc_jsons + SET solution = %s + WHERE id = %s + """, (new_solution, row_id)) + + updated += 1 + + if updated % 20 == 0: + print(f" Updated: {updated}/{len(entries)}") + + except Exception as e: + errors += 1 + print(f" ✗ Error with {puzzle_id}: {e}") + if errors > 10: + print("Too many errors, stopping...") + break + + # Commit changes + connection.commit() + + print(f"\n{'=' * 50}") + print(f"✓ Solution update complete!") + print(f" Updated: {updated}") + print(f" Skipped (unchanged): {skipped}") + print(f" Errors: {errors}") + + # Show a sample of updated solutions + if updated > 0: + cursor.execute(""" + SELECT aj.arc_puzzle_id, aj.solution + FROM arc_jsons aj + JOIN arc_puzzles ap ON aj.arc_puzzle_id = ap.id + WHERE ap.corpora = 'ConceptArc' + LIMIT 3 + """) + + print(f"\n{'=' * 50}") + print("Sample updated entries:") + print("=" * 50) + + for puzzle_id, solution in cursor.fetchall(): + if solution: + sol_data = json.loads(solution) + print(f"\n{puzzle_id}:") + print(f" Number of test outputs: {len(sol_data)}") + if isinstance(sol_data, 
list) and len(sol_data) > 0: + first_output = sol_data[0] + if isinstance(first_output, list): + print(f" First output dimensions: {len(first_output)}x{len(first_output[0]) if first_output else 0}") + + except Exception as e: + connection.rollback() + print(f"\n✗ Error: {e}") + import traceback + traceback.print_exc() + return 1 + finally: + connection.close() + print(f"\n✓ Database connection closed") + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/scripts/remove_duplicates.py b/scripts/remove_duplicates.py new file mode 100755 index 0000000..7b5097f --- /dev/null +++ b/scripts/remove_duplicates.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +""" +Remove duplicate entries from arc_puzzles table +Keeps only the first occurrence of each unique (id, corpora, set) combination +""" + +import os +import sys +import pymysql +from dotenv import load_dotenv + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + return { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + +def main(): + print("Removing Duplicates from arc_puzzles Table") + print("=" * 50) + + config = load_env_config() + connection = pymysql.connect(**config) + cursor = connection.cursor() + + try: + # First, check if arc_puzzles has a primary key or unique identifier + cursor.execute("DESCRIBE arc_puzzles") + columns = cursor.fetchall() + print("\nTable structure:") + for col in columns: + print(f" {col[0]:<20} {col[1]:<20} Key:{col[3]}") + + # Find duplicates + cursor.execute(""" + SELECT id, corpora, `set`, COUNT(*) as count + FROM arc_puzzles + GROUP BY id, corpora, `set` + HAVING COUNT(*) > 1 + ORDER BY id + """) + + duplicates = cursor.fetchall() + + if not duplicates: + print("\n✓ No duplicates found!") + return 0 + + print(f"\nFound {len(duplicates)} sets of duplicates") + 
total_to_remove = sum(count - 1 for _, _, _, count in duplicates) + print(f"Total rows to remove: {total_to_remove}") + + # Ask for confirmation + if '--yes' not in sys.argv: + print(f"\n⚠ This will delete {total_to_remove} duplicate rows") + response = input("Continue? (yes/no): ").strip().lower() + if response not in ['yes', 'y']: + print("Operation cancelled") + return 0 + else: + print(f"\n⚠ Deleting {total_to_remove} duplicate rows (auto-confirmed with --yes flag)") + + # For each duplicate set, keep only one and delete the rest + # Since there's no auto-increment primary key, we'll use a different approach + # We'll create a temporary table with unique entries, then replace the original + + print("\nRemoving duplicates...") + removed_count = 0 + + for puzzle_id, corpora, set_name, count in duplicates: + if count > 1: + # Keep one, delete the extras + # We delete (count - 1) duplicates + delete_count = count - 1 + + # Delete using LIMIT to remove only the extra copies + cursor.execute(""" + DELETE FROM arc_puzzles + WHERE id = %s AND corpora = %s AND `set` = %s + LIMIT %s + """, (puzzle_id, corpora, set_name, delete_count)) + + removed_count += cursor.rowcount + + if removed_count % 50 == 0: + print(f" Removed {removed_count}/{total_to_remove} duplicates...") + + # Commit the changes + connection.commit() + + print(f"\n{'=' * 50}") + print(f"✓ Duplicate removal complete!") + print(f" Total duplicates removed: {removed_count}") + + # Verify no duplicates remain + cursor.execute(""" + SELECT COUNT(*) + FROM ( + SELECT id, corpora, `set`, COUNT(*) as count + FROM arc_puzzles + GROUP BY id, corpora, `set` + HAVING COUNT(*) > 1 + ) as dups + """) + + remaining_dups = cursor.fetchone()[0] + if remaining_dups > 0: + print(f"\n⚠ Warning: {remaining_dups} duplicate sets still remain") + else: + print(f"\n✓ No duplicates remaining!") + + # Show final counts + cursor.execute("SELECT COUNT(*) FROM arc_puzzles") + final_count = cursor.fetchone()[0] + print(f"\nFinal table 
size: {final_count} rows") + + cursor.execute(""" + SELECT corpora, COUNT(*) as count + FROM arc_puzzles + GROUP BY corpora + ORDER BY corpora + """) + print("\nBy corpora:") + for corpora, count in cursor.fetchall(): + print(f" {corpora}: {count}") + + except Exception as e: + connection.rollback() + print(f"\n✗ Error: {e}") + import traceback + traceback.print_exc() + return 1 + finally: + connection.close() + print(f"\n✓ Database connection closed") + + return 0 + +if __name__ == '__main__': + sys.exit(main()) diff --git a/scripts/upload_conceptarc_to_db.py b/scripts/upload_conceptarc_to_db.py new file mode 100755 index 0000000..cec9353 --- /dev/null +++ b/scripts/upload_conceptarc_to_db.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +""" +Upload ConceptArc puzzle data to MariaDB database +Reads credentials from .env file and inserts all ConceptArc JSON files +""" + +import json +import os +import sys +from pathlib import Path +import pymysql +from dotenv import load_dotenv + +# Force unbuffered output +sys.stdout.reconfigure(line_buffering=True) +sys.stderr.reconfigure(line_buffering=True) + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + + config = { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + + return config + +def get_conceptarc_files(): + """Get all JSON files from ConceptArc subdirectories""" + base_dir = Path('data/ConceptArc') + + if not base_dir.exists(): + print(f"✗ ConceptArc directory not found: {base_dir}") + return [] + + # Get all subdirectories (concept categories) + categories = [d for d in base_dir.iterdir() if d.is_dir()] + + all_files = [] + for category in sorted(categories): + json_files = sorted(category.glob('*.json')) + all_files.extend(json_files) + + return all_files + +def insert_puzzle(cursor, file_path): + """Insert a single 
ConceptArc puzzle into the database""" + # Extract puzzle ID from filename (e.g., "Count1.json" -> "Count1") + puzzle_id = file_path.stem + + # Extract category from parent directory (e.g., "Count", "Center", etc.) + category = file_path.parent.name + + # Read JSON content + with open(file_path, 'r') as f: + json_content = f.read().strip() + + # Verify JSON is valid and extract solution + try: + puzzle_data = json.loads(json_content) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in {file_path}: {e}") + + # Extract all solutions from test cases + # ConceptArc has multiple test cases, each with their own output + solution = None + if 'test' in puzzle_data and len(puzzle_data['test']) > 0: + # Collect all test outputs into an array + all_outputs = [] + for test_case in puzzle_data['test']: + if 'output' in test_case: + all_outputs.append(test_case['output']) + + # Store as JSON array if there are any outputs + if all_outputs: + solution = json.dumps(all_outputs) + + # Insert into arc_puzzles table + # id, corpora, set, difficulty, version + # Note: `set` is a reserved keyword, so we escape it with backticks + sql_puzzles = """ + INSERT INTO arc_puzzles (id, corpora, `set`, difficulty, version) + VALUES (%s, %s, %s, %s, %s) + """ + cursor.execute(sql_puzzles, (puzzle_id, 'ConceptArc', category, None, None)) + + # Insert into arc_jsons table + # arc_puzzle_id, json, solution + sql_jsons = """ + INSERT INTO arc_jsons (arc_puzzle_id, json, solution) + VALUES (%s, %s, %s) + """ + cursor.execute(sql_jsons, (puzzle_id, json_content, solution)) + + return puzzle_id, category + +def main(): + print("ConceptArc Data Upload to MariaDB") + print("=" * 50) + + # Load configuration + try: + config = load_env_config() + print(f"✓ Loaded configuration from .env") + print(f" Host: {config['host']}") + print(f" Database: {config['database']}") + print(f" User: {config['user']}") + except Exception as e: + print(f"✗ Error loading configuration: {e}") + return 1 + 
+ # Get all ConceptArc files + all_files = get_conceptarc_files() + if not all_files: + print(f"✗ No ConceptArc JSON files found") + return 1 + + print(f"✓ Found {len(all_files)} ConceptArc JSON files") + + # Count files per category + from collections import defaultdict + category_counts = defaultdict(int) + for file_path in all_files: + category_counts[file_path.parent.name] += 1 + + print(f"\nBreakdown by category:") + for category, count in sorted(category_counts.items()): + print(f" - {category}: {count} files") + + # Connect to database + try: + print(f"\nConnecting to database...") + connection = pymysql.connect(**config) + print(f"✓ Connected successfully") + except Exception as e: + print(f"✗ Database connection failed: {e}") + return 1 + + try: + cursor = connection.cursor() + + # Check if tables exist + cursor.execute("SHOW TABLES LIKE 'arc_puzzles'") + if not cursor.fetchone(): + print(f"✗ Table 'arc_puzzles' does not exist") + return 1 + + cursor.execute("SHOW TABLES LIKE 'arc_jsons'") + if not cursor.fetchone(): + print(f"✗ Table 'arc_jsons' does not exist") + return 1 + + # Get current counts + cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'") + initial_puzzles_count = cursor.fetchone()[0] + cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE arc_puzzle_id LIKE '%Count%' OR arc_puzzle_id LIKE '%Center%'") + initial_jsons_count = cursor.fetchone()[0] + + print(f"✓ Tables exist") + print(f" Current ConceptArc puzzles in arc_puzzles: {initial_puzzles_count}") + print(f" Current ConceptArc-like entries in arc_jsons: {initial_jsons_count}") + + # Ask for confirmation (unless --yes flag is provided) + if '--yes' not in sys.argv: + print(f"\n⚠ About to insert {len(all_files)} ConceptArc records") + response = input("Continue? 
(yes/no): ").strip().lower() + if response not in ['yes', 'y']: + print("Upload cancelled") + return 0 + else: + print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)") + + print(f"\nInserting records...") + inserted = 0 + errors = 0 + category_inserted = defaultdict(int) + + for i, file_path in enumerate(all_files, 1): + try: + puzzle_id, category = insert_puzzle(cursor, file_path) + inserted += 1 + category_inserted[category] += 1 + + # Show progress every 20 records + if i % 20 == 0 or i == len(all_files): + print(f" Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)") + + except pymysql.IntegrityError as e: + # Likely duplicate key + if "Duplicate entry" in str(e): + errors += 1 + if errors <= 5: # Only show first 5 errors + print(f" ⚠ Duplicate: {file_path.stem} ({file_path.parent.name})") + else: + raise + except Exception as e: + errors += 1 + print(f" ✗ Error with {file_path.stem}: {e}") + if errors > 10: + print(f" Too many errors, stopping...") + break + + # Commit the transaction + connection.commit() + + # Get final counts + cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'") + final_puzzles_count = cursor.fetchone()[0] + cursor.execute("SELECT COUNT(*) FROM arc_jsons") + final_jsons_count = cursor.fetchone()[0] + + print(f"\n{'=' * 50}") + print(f"✓ Upload complete!") + print(f" Successfully inserted: {inserted}") + print(f" Errors/duplicates: {errors}") + print(f" ConceptArc puzzles: {initial_puzzles_count} → {final_puzzles_count} (+{final_puzzles_count - initial_puzzles_count})") + + print(f"\nInserted by category:") + for category, count in sorted(category_inserted.items()): + print(f" - {category}: {count} puzzles") + + except Exception as e: + connection.rollback() + print(f"\n✗ Error during upload: {e}") + import traceback + traceback.print_exc() + return 1 + finally: + connection.close() + print(f"\n✓ Database connection closed") + + return 0 + +if __name__ == 
'__main__': + sys.exit(main()) diff --git a/scripts/verify_conceptarc_upload.py b/scripts/verify_conceptarc_upload.py new file mode 100755 index 0000000..c15db91 --- /dev/null +++ b/scripts/verify_conceptarc_upload.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +""" +Verify ConceptArc data was uploaded correctly to the database +""" + +import os +import json +import pymysql +from dotenv import load_dotenv + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + return { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + +def main(): + print("Verifying ConceptArc Upload") + print("=" * 50) + + config = load_env_config() + connection = pymysql.connect(**config) + cursor = connection.cursor() + + # Check arc_puzzles table + cursor.execute(""" + SELECT `set`, COUNT(*) + FROM arc_puzzles + WHERE corpora = 'ConceptArc' + GROUP BY `set` + ORDER BY `set` + """) + + print("\nConceptArc puzzles by category (from arc_puzzles):") + total = 0 + for category, count in cursor.fetchall(): + print(f" {category}: {count} puzzles") + total += count + print(f" TOTAL: {total} puzzles") + + # Check arc_jsons table + cursor.execute(""" + SELECT COUNT(*) + FROM arc_jsons aj + JOIN arc_puzzles ap ON aj.arc_puzzle_id = ap.id + WHERE ap.corpora = 'ConceptArc' + """) + json_count = cursor.fetchone()[0] + print(f"\nConceptArc entries in arc_jsons: {json_count}") + + # Sample some puzzles + cursor.execute(""" + SELECT ap.id, ap.corpora, ap.`set`, aj.json, aj.solution + FROM arc_puzzles ap + JOIN arc_jsons aj ON ap.id = aj.arc_puzzle_id + WHERE ap.corpora = 'ConceptArc' + LIMIT 3 + """) + + print("\n" + "=" * 50) + print("Sample ConceptArc entries:") + print("=" * 50) + + for row in cursor.fetchall(): + puzzle_id, corpora, category, json_data, solution = row + print(f"\nPuzzle ID: {puzzle_id}") + 
print(f" Corpora: {corpora}") + print(f" Category: {category}") + print(f" JSON length: {len(json_data)} chars") + print(f" Has solution: {'Yes' if solution else 'No'}") + + if solution: + sol = json.loads(solution) + print(f" Solution dimensions: {len(sol)}x{len(sol[0]) if sol else 0}") + + connection.close() + print("\n✓ Verification complete") + +if __name__ == '__main__': + main() diff --git a/scripts/verify_solution_format.py b/scripts/verify_solution_format.py new file mode 100644 index 0000000..84139e4 --- /dev/null +++ b/scripts/verify_solution_format.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +""" +Verify that solutions are stored correctly for different corpora +- V1/V2/evaluation: Single grid (one test case) +- ConceptArc: Array of grids (multiple test cases) +""" + +import os +import json +import pymysql +from dotenv import load_dotenv + +def load_env_config(): + """Load database configuration from .env file""" + load_dotenv() + return { + 'host': os.getenv('DB_HOST'), + 'user': os.getenv('DB_USER'), + 'password': os.getenv('DB_PASSWORD'), + 'database': os.getenv('DB_NAME'), + 'port': int(os.getenv('DB_PORT', 3306)), + 'charset': 'utf8mb4' + } + +def check_solution_format(cursor, corpora_name, expected_test_count): + """Check solution format for a specific corpora""" + print(f"\n{'='*50}") + print(f"Checking {corpora_name} puzzles:") + print('='*50) + + cursor.execute(f""" + SELECT aj.arc_puzzle_id, aj.json, aj.solution + FROM arc_jsons aj + JOIN arc_puzzles ap ON aj.arc_puzzle_id = ap.id + WHERE ap.corpora = %s + LIMIT 5 + """, (corpora_name,)) + + results = cursor.fetchall() + + if not results: + print(f"No {corpora_name} puzzles found") + return + + mismatches = [] + + for puzzle_id, json_str, solution in results: + puzzle_data = json.loads(json_str) + test_count = len(puzzle_data.get('test', [])) + + if not solution: + print(f"⚠ {puzzle_id}: No solution stored!") + continue + + sol = json.loads(solution) + + # Determine solution structure + if 
isinstance(sol, list) and len(sol) > 0: + # Check if it's array of grids or single grid + if isinstance(sol[0], list) and len(sol[0]) > 0 and isinstance(sol[0][0], list): + # Array of grids (ConceptArc style) + sol_count = len(sol) + structure = f"Array of {sol_count} grids" + else: + # Single grid (regular ARC style) + sol_count = 1 + structure = f"Single grid ({len(sol)}x{len(sol[0]) if sol else 0})" + else: + structure = "Unknown format" + sol_count = 0 + + match = "✓" if sol_count == test_count else "✗" + print(f"{match} {puzzle_id}: {test_count} tests, {structure}") + + if sol_count != test_count: + mismatches.append((puzzle_id, test_count, sol_count)) + + if mismatches: + print(f"\n⚠ Found {len(mismatches)} mismatches:") + for pid, expected, actual in mismatches: + print(f" {pid}: Expected {expected} solutions, got {actual}") + else: + print(f"\n✓ All solutions match their test counts!") + +def main(): + print("Verifying Solution Formats") + print("=" * 50) + + config = load_env_config() + connection = pymysql.connect(**config) + cursor = connection.cursor() + + try: + # Check different corpora + check_solution_format(cursor, "V1", 1) + check_solution_format(cursor, "V2", 1) + check_solution_format(cursor, "evaluation", 1) + check_solution_format(cursor, "ConceptArc", 3) + + # Summary stats + print(f"\n{'='*50}") + print("Summary by corpora:") + print('='*50) + + cursor.execute(""" + SELECT ap.corpora, + COUNT(*) as total, + COUNT(aj.solution) as with_solution + FROM arc_puzzles ap + JOIN arc_jsons aj ON ap.id = aj.arc_puzzle_id + GROUP BY ap.corpora + ORDER BY ap.corpora + """) + + for corpora, total, with_sol in cursor.fetchall(): + print(f" {corpora}: {with_sol}/{total} have solutions") + + except Exception as e: + print(f"\n✗ Error: {e}") + import traceback + traceback.print_exc() + return 1 + finally: + connection.close() + print(f"\n✓ Database connection closed") + + return 0 + +if __name__ == '__main__': + import sys + sys.exit(main()) diff --git 
a/todo.md b/todo.md index 8a016de..da580ee 100644 --- a/todo.md +++ b/todo.md @@ -1,4 +1,14 @@ -Add ConceptARC corpus +Add ConceptARC Corpus + - to the Repository + - To the DB + - Categorize them in the DB + - Fix the Solution extraction method that is different from the other Corpora + +Interface: +- Remove the Header frame, make it a single frame interface to increase the + +DB: +- Make the View with Skills, Category etc. user Inputs Puzzle Assignment