ConceptARC db upload

This commit is contained in:
bmachado
2025-11-05 13:28:23 +00:00
parent aec95ed954
commit fa5f12fd47
10 changed files with 1061 additions and 1 deletions

10
.env Normal file
View File

@ -0,0 +1,10 @@
# Template: copy this file to .env and fill in the values.
# Never commit real credentials; keep the filled-in .env out of version control.
# Database configuration
DB_HOST=mariadb.vialink.com.br
DB_USER=bp_solver
DB_PASSWORD=A0bn%3H5sHl7C8Jk
DB_NAME=arc
DB_PORT=3306
# Add other secrets here as needed
# e.g., API keys: OPENAI_API_KEY=...

View File

@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Check arc_puzzles table for ConceptArc entries
"""
import os
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Build the pymysql connection settings from the environment.

    ``load_dotenv()`` is called first so that values from a local ``.env``
    file are visible through ``os.getenv``.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password``, ``database``, ``port`` and ``charset``. Settings that
        are not defined come back as ``None``.

    Raises:
        ValueError: if ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    # os.getenv only applies its default when the variable is absent; a blank
    # "DB_PORT=" yields '' and int('') would raise, so treat blank as unset.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def main():
    """Print a summary of the ConceptArc rows stored in ``arc_puzzles``.

    Reports the row count per corpora, the first 20 ConceptArc rows and the
    ConceptArc row count per ``set`` value. Everything is written to stdout;
    nothing is returned.
    """
    print("Checking arc_puzzles table")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    # try/finally so the connection is released even when a query fails.
    try:
        cursor = connection.cursor()

        # Check total counts by corpora
        cursor.execute("""
            SELECT corpora, COUNT(*) as count
            FROM arc_puzzles
            GROUP BY corpora
            ORDER BY corpora
        """)
        print("\nTotal puzzles by corpora:")
        for corpora, count in cursor.fetchall():
            print(f" {corpora}: {count}")

        # Check ConceptArc entries in detail
        cursor.execute("""
            SELECT id, corpora, `set`, difficulty, version
            FROM arc_puzzles
            WHERE corpora = 'ConceptArc'
            ORDER BY `set`, id
            LIMIT 20
        """)
        print("\n" + "=" * 50)
        print("First 20 ConceptArc entries in arc_puzzles:")
        print("=" * 50)
        print(f"{'ID':<20} {'Corpora':<15} {'Set':<20} {'Difficulty':<12} {'Version':<10}")
        print("-" * 80)
        rows = cursor.fetchall()
        if rows:
            for puzzle_id, corpora, set_name, difficulty, version in rows:
                # str() every field: a NULL column arrives as None, and None
                # rejects the '<20' alignment spec with a TypeError.
                print(
                    f"{str(puzzle_id):<20} {str(corpora):<15} {str(set_name):<20} "
                    f"{str(difficulty):<12} {str(version):<10}"
                )
        else:
            print("No ConceptArc entries found!")

        # Count by set
        cursor.execute("""
            SELECT `set`, COUNT(*) as count
            FROM arc_puzzles
            WHERE corpora = 'ConceptArc'
            GROUP BY `set`
            ORDER BY `set`
        """)
        print("\n" + "=" * 50)
        print("ConceptArc puzzles by category:")
        print("=" * 50)
        for set_name, count in cursor.fetchall():
            print(f" {set_name}: {count}")
    finally:
        connection.close()


if __name__ == '__main__':
    main()

117
scripts/check_schema.py Executable file
View File

@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
Check the database schema for arc_puzzles table
"""
import os
import sys
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Build the pymysql connection settings from the environment.

    ``load_dotenv()`` is called first so that values from a local ``.env``
    file are visible through ``os.getenv``.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password``, ``database``, ``port`` and ``charset``. Settings that
        are not defined come back as ``None``.

    Raises:
        ValueError: if ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    # os.getenv only applies its default when the variable is absent; a blank
    # "DB_PORT=" yields '' and int('') would raise, so treat blank as unset.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def _print_columns(columns):
    """Print one aligned line per row of a ``DESCRIBE <table>`` result."""
    for col in columns:
        print(f" {col[0]:<20} {col[1]:<20} Null:{col[2]} Key:{col[3]} Default:{col[4]}")


def main():
    """List the database tables and show the ``arc_puzzles`` / ``arc_jsons`` schema.

    For ``arc_puzzles`` the first three rows are printed as well; a column
    named ``json`` is shown only as a length.

    Returns:
        int: 0 on success, 1 when configuration loading, connecting or a
        query fails (the error is printed).
    """
    print("Checking Database Schema")
    print("=" * 50)

    # Load configuration
    try:
        config = load_env_config()
        print("✓ Loaded configuration from .env")
        print(f" Host: {config['host']}")
        print(f" Database: {config['database']}")
        print(f" User: {config['user']}")
    except Exception as e:
        print(f"✗ Error loading configuration: {e}")
        return 1

    # Connect to database
    try:
        print("\nConnecting to database...")
        connection = pymysql.connect(**config)
        print("✓ Connected successfully")
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return 1

    try:
        cursor = connection.cursor()

        # Show all tables
        print("\n" + "=" * 50)
        print("TABLES IN DATABASE:")
        print("=" * 50)
        cursor.execute("SHOW TABLES")
        for table in cursor.fetchall():
            print(f" - {table[0]}")

        # Check for arc_puzzles table
        cursor.execute("SHOW TABLES LIKE 'arc_puzzles'")
        if cursor.fetchone():
            print("\n" + "=" * 50)
            print("SCHEMA FOR 'arc_puzzles' TABLE:")
            print("=" * 50)
            cursor.execute("DESCRIBE arc_puzzles")
            columns = cursor.fetchall()
            _print_columns(columns)

            # Get sample data
            cursor.execute("SELECT * FROM arc_puzzles LIMIT 3")
            print("\n" + "=" * 50)
            print("SAMPLE DATA (first 3 rows):")
            print("=" * 50)
            rows = cursor.fetchall()
            if rows:
                # Reuse the DESCRIBE result fetched above instead of issuing
                # the same statement a second time.
                column_names = [col[0] for col in columns]
                print(" Columns:", ", ".join(column_names))
                for i, row in enumerate(rows, 1):
                    print(f"\n Row {i}:")
                    for col_name, value in zip(column_names, row):
                        if col_name == 'json':
                            print(f" {col_name}: [JSON data, length={len(str(value))}]")
                        else:
                            print(f" {col_name}: {value}")
            else:
                print(" (No data in table)")
        else:
            print("\n✗ Table 'arc_puzzles' does not exist")

        # Check for arc_jsons table
        cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
        if cursor.fetchone():
            print("\n" + "=" * 50)
            print("SCHEMA FOR 'arc_jsons' TABLE:")
            print("=" * 50)
            cursor.execute("DESCRIBE arc_jsons")
            _print_columns(cursor.fetchall())
    except Exception as e:
        print(f"\n✗ Error querying database: {e}")
        return 1
    finally:
        connection.close()
        print("\n✓ Database connection closed")
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""
Find duplicate puzzle IDs in arc_puzzles table
"""
import os
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Build the pymysql connection settings from the environment.

    ``load_dotenv()`` is called first so that values from a local ``.env``
    file are visible through ``os.getenv``.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password``, ``database``, ``port`` and ``charset``. Settings that
        are not defined come back as ``None``.

    Raises:
        ValueError: if ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    # os.getenv only applies its default when the variable is absent; a blank
    # "DB_PORT=" yields '' and int('') would raise, so treat blank as unset.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def main():
    """Report rows of ``arc_puzzles`` that share the same (id, corpora, set).

    Prints every duplicated combination with its row count, the number of
    surplus rows overall and for the ConceptArc corpora, then the total row
    count and the count per corpora. Read-only; nothing is returned.
    """
    print("Finding Duplicate IDs in arc_puzzles")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    # try/finally so the connection is released even when a query fails.
    try:
        cursor = connection.cursor()

        # Find all duplicate IDs
        cursor.execute("""
            SELECT id, corpora, `set`, COUNT(*) as count
            FROM arc_puzzles
            GROUP BY id, corpora, `set`
            HAVING COUNT(*) > 1
            ORDER BY count DESC, id
        """)
        duplicates = cursor.fetchall()

        if duplicates:
            print(f"\n⚠ Found {len(duplicates)} duplicate entries:")
            print(f"{'ID':<20} {'Corpora':<15} {'Set':<20} {'Count':<10}")
            print("-" * 70)
            for puzzle_id, corpora, set_name, count in duplicates:
                # str() the grouped columns: a NULL group key arrives as None,
                # which rejects the '<20' alignment spec with a TypeError.
                print(f"{str(puzzle_id):<20} {str(corpora):<15} {str(set_name):<20} {count:<10}")

            # Calculate totals: one row of each group is the keeper.
            total_duplicates = sum(count - 1 for _, _, _, count in duplicates)
            print(f"\nTotal duplicate rows to remove: {total_duplicates}")

            # Check ConceptArc specifically
            conceptarc_dups = [d for d in duplicates if d[1] == 'ConceptArc']
            if conceptarc_dups:
                print(f"\nConceptArc duplicates: {len(conceptarc_dups)} unique IDs")
                conceptarc_dup_count = sum(count - 1 for _, _, _, count in conceptarc_dups)
                print(f"ConceptArc duplicate rows to remove: {conceptarc_dup_count}")
        else:
            print("\n✓ No duplicates found!")

        # Show total counts
        print("\n" + "=" * 50)
        print("Current table statistics:")
        print("=" * 50)
        cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
        total = cursor.fetchone()[0]
        print(f"Total rows in arc_puzzles: {total}")

        cursor.execute("""
            SELECT corpora, COUNT(*) as count
            FROM arc_puzzles
            GROUP BY corpora
            ORDER BY corpora
        """)
        print("\nBy corpora:")
        for corpora, count in cursor.fetchall():
            print(f" {corpora}: {count}")
    finally:
        connection.close()


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Fix ConceptArc solutions in the database to include ALL test outputs, not just the last one
"""
import json
import os
import sys
from pathlib import Path
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Build the pymysql connection settings from the environment.

    ``load_dotenv()`` is called first so that values from a local ``.env``
    file are visible through ``os.getenv``.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password``, ``database``, ``port`` and ``charset``. Settings that
        are not defined come back as ``None``.

    Raises:
        ValueError: if ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    # os.getenv only applies its default when the variable is absent; a blank
    # "DB_PORT=" yields '' and int('') would raise, so treat blank as unset.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def main():
    """Rewrite each ConceptArc ``solution`` as the JSON list of all test outputs.

    Reads every ``arc_jsons`` row joined to a ConceptArc puzzle, rebuilds the
    solution from the ``output`` of each entry under ``test`` in the stored
    puzzle JSON, and updates rows whose stored text differs. Asks for
    confirmation unless ``--yes`` is on the command line. Per-row failures are
    counted and reported; processing stops once more than 10 have occurred,
    and whatever was updated so far is still committed.

    Returns:
        int: 0 when the run finishes, finds nothing or is cancelled; 1 when an
        unexpected error aborts it (the transaction is rolled back).
    """
    print("Fixing ConceptArc Solutions")
    print("=" * 50)

    db = pymysql.connect(**load_env_config())
    cur = db.cursor()
    try:
        # Every arc_jsons row that belongs to a ConceptArc puzzle.
        cur.execute("""
            SELECT aj.id, aj.arc_puzzle_id, aj.json, aj.solution
            FROM arc_jsons aj
            JOIN arc_puzzles ap ON aj.arc_puzzle_id = ap.id
            WHERE ap.corpora = 'ConceptArc'
            ORDER BY aj.arc_puzzle_id
        """)
        entries = cur.fetchall()
        total = len(entries)
        print(f"Found {total} ConceptArc entries to check")
        if not entries:
            print("No ConceptArc entries found!")
            return 0

        # Confirmation gate: interactive prompt unless --yes was passed.
        if '--yes' in sys.argv:
            print("Updating solutions (auto-confirmed with --yes flag)")
        else:
            answer = input(f"\nUpdate solutions for {total} entries? (yes/no): ").strip().lower()
            if answer not in ('yes', 'y'):
                print("Operation cancelled")
                return 0

        updated = errors = skipped = 0
        print("\nProcessing entries...")
        for row_id, puzzle_id, json_str, current_solution in entries:
            try:
                puzzle_data = json.loads(json_str)

                # Gather the output grid of every test case that has one.
                if 'test' in puzzle_data:
                    outputs = [case['output'] for case in puzzle_data['test'] if 'output' in case]
                else:
                    outputs = []

                if not outputs:
                    print(f"{puzzle_id}: No test outputs found")
                    skipped += 1
                    continue

                new_solution = json.dumps(outputs)
                # Stored text already matches: nothing to write.
                if current_solution == new_solution:
                    skipped += 1
                    continue

                cur.execute("""
                    UPDATE arc_jsons
                    SET solution = %s
                    WHERE id = %s
                """, (new_solution, row_id))
                updated += 1
                if updated % 20 == 0:
                    print(f" Updated: {updated}/{total}")
            except Exception as e:
                errors += 1
                print(f" ✗ Error with {puzzle_id}: {e}")
                if errors > 10:
                    print("Too many errors, stopping...")
                    break

        # Persist whatever was updated, including after an early stop.
        db.commit()

        print(f"\n{'=' * 50}")
        print("✓ Solution update complete!")
        print(f" Updated: {updated}")
        print(f" Skipped (unchanged): {skipped}")
        print(f" Errors: {errors}")

        # Show up to three ConceptArc solutions as a spot check.
        if updated > 0:
            cur.execute("""
                SELECT aj.arc_puzzle_id, aj.solution
                FROM arc_jsons aj
                JOIN arc_puzzles ap ON aj.arc_puzzle_id = ap.id
                WHERE ap.corpora = 'ConceptArc'
                LIMIT 3
            """)
            print(f"\n{'=' * 50}")
            print("Sample updated entries:")
            print("=" * 50)
            for puzzle_id, solution in cur.fetchall():
                if not solution:
                    continue
                sol_data = json.loads(solution)
                print(f"\n{puzzle_id}:")
                print(f" Number of test outputs: {len(sol_data)}")
                if not (isinstance(sol_data, list) and len(sol_data) > 0):
                    continue
                first_output = sol_data[0]
                if isinstance(first_output, list):
                    cols = len(first_output[0]) if first_output else 0
                    print(f" First output dimensions: {len(first_output)}x{cols}")
    except Exception as e:
        db.rollback()
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        db.close()
        print("\n✓ Database connection closed")
    return 0


if __name__ == '__main__':
    sys.exit(main())

146
scripts/remove_duplicates.py Executable file
View File

@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""
Remove duplicate entries from arc_puzzles table
Keeps only the first occurrence of each unique (id, corpora, set) combination
"""
import os
import sys
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Build the pymysql connection settings from the environment.

    ``load_dotenv()`` is called first so that values from a local ``.env``
    file are visible through ``os.getenv``.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password``, ``database``, ``port`` and ``charset``. Settings that
        are not defined come back as ``None``.

    Raises:
        ValueError: if ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    # os.getenv only applies its default when the variable is absent; a blank
    # "DB_PORT=" yields '' and int('') would raise, so treat blank as unset.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def main():
    """Delete surplus ``arc_puzzles`` rows sharing the same (id, corpora, set).

    For every duplicated combination, ``count - 1`` rows are deleted with
    ``DELETE ... LIMIT``, leaving one. The statement has no ORDER BY, so which
    physical row survives is up to the server. Asks for confirmation unless
    ``--yes`` is on the command line, commits once at the end, then re-checks
    for remaining duplicates and prints the final counts.

    Returns:
        int: 0 when the run finishes, finds nothing or is cancelled; 1 when an
        error aborts it (the transaction is rolled back).
    """
    print("Removing Duplicates from arc_puzzles Table")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    cursor = connection.cursor()
    try:
        # Show the table structure so the operator can see which keys exist.
        cursor.execute("DESCRIBE arc_puzzles")
        columns = cursor.fetchall()
        print("\nTable structure:")
        for col in columns:
            print(f" {col[0]:<20} {col[1]:<20} Key:{col[3]}")

        # Find duplicates
        cursor.execute("""
            SELECT id, corpora, `set`, COUNT(*) as count
            FROM arc_puzzles
            GROUP BY id, corpora, `set`
            HAVING COUNT(*) > 1
            ORDER BY id
        """)
        duplicates = cursor.fetchall()
        if not duplicates:
            print("\n✓ No duplicates found!")
            return 0

        print(f"\nFound {len(duplicates)} sets of duplicates")
        total_to_remove = sum(count - 1 for _, _, _, count in duplicates)
        print(f"Total rows to remove: {total_to_remove}")

        # Ask for confirmation
        if '--yes' not in sys.argv:
            print(f"\n⚠ This will delete {total_to_remove} duplicate rows")
            response = input("Continue? (yes/no): ").strip().lower()
            if response not in ['yes', 'y']:
                print("Operation cancelled")
                return 0
        else:
            print(f"\n⚠ Deleting {total_to_remove} duplicate rows (auto-confirmed with --yes flag)")

        # The rows of a group cannot be told apart by these three columns, so
        # the surplus copies are removed by count: DELETE ... LIMIT (count - 1).
        print("\nRemoving duplicates...")
        removed_count = 0
        for puzzle_id, corpora, set_name, count in duplicates:
            # '<=>' is the null-safe equality operator. With plain '=' a group
            # whose corpora or `set` is NULL would match no rows (NULL = NULL
            # is not true) and its duplicates would never be deleted.
            cursor.execute("""
                DELETE FROM arc_puzzles
                WHERE id <=> %s AND corpora <=> %s AND `set` <=> %s
                LIMIT %s
            """, (puzzle_id, corpora, set_name, count - 1))
            removed_count += cursor.rowcount
            if removed_count % 50 == 0:
                print(f" Removed {removed_count}/{total_to_remove} duplicates...")

        # Commit the changes
        connection.commit()
        print(f"\n{'=' * 50}")
        print("✓ Duplicate removal complete!")
        print(f" Total duplicates removed: {removed_count}")

        # Verify no duplicates remain
        cursor.execute("""
            SELECT COUNT(*)
            FROM (
                SELECT id, corpora, `set`, COUNT(*) as count
                FROM arc_puzzles
                GROUP BY id, corpora, `set`
                HAVING COUNT(*) > 1
            ) as dups
        """)
        remaining_dups = cursor.fetchone()[0]
        if remaining_dups > 0:
            print(f"\n⚠ Warning: {remaining_dups} duplicate sets still remain")
        else:
            print("\n✓ No duplicates remaining!")

        # Show final counts
        cursor.execute("SELECT COUNT(*) FROM arc_puzzles")
        final_count = cursor.fetchone()[0]
        print(f"\nFinal table size: {final_count} rows")

        cursor.execute("""
            SELECT corpora, COUNT(*) as count
            FROM arc_puzzles
            GROUP BY corpora
            ORDER BY corpora
        """)
        print("\nBy corpora:")
        for corpora, count in cursor.fetchall():
            print(f" {corpora}: {count}")
    except Exception as e:
        connection.rollback()
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        connection.close()
        print("\n✓ Database connection closed")
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Upload ConceptArc puzzle data to MariaDB database
Reads credentials from .env file and inserts all ConceptArc JSON files
"""
import json
import os
import sys
from pathlib import Path
import pymysql
from dotenv import load_dotenv
# Switch stdout and stderr to line buffering (not fully unbuffered): each
# printed line is flushed as soon as it ends, so progress messages are not
# held back in a buffer.
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
def load_env_config():
    """Build the pymysql connection settings from the environment.

    ``load_dotenv()`` is called first so that values from a local ``.env``
    file are visible through ``os.getenv``.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password``, ``database``, ``port`` and ``charset``. Settings that
        are not defined come back as ``None``.

    Raises:
        ValueError: if ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    # os.getenv only applies its default when the variable is absent; a blank
    # "DB_PORT=" yields '' and int('') would raise, so treat blank as unset.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def get_conceptarc_files(base_dir='data/ConceptArc'):
    """Collect the puzzle JSON files stored one sub-directory per category.

    Only ``*.json`` files sitting directly inside an immediate sub-directory
    of ``base_dir`` are returned; files directly in ``base_dir`` and files in
    deeper levels are ignored.

    Args:
        base_dir: Directory holding the category sub-directories. Defaults to
            ``data/ConceptArc``, resolved against the current working
            directory.

    Returns:
        list[Path]: the files ordered by category directory, then by file
        path. Empty (after printing a notice) when ``base_dir`` is not an
        existing directory.
    """
    base_dir = Path(base_dir)
    # is_dir() rather than exists(): a plain file at this path would pass an
    # existence check and then make iterdir() raise.
    if not base_dir.is_dir():
        print(f"✗ ConceptArc directory not found: {base_dir}")
        return []

    all_files = []
    for category in sorted(d for d in base_dir.iterdir() if d.is_dir()):
        all_files.extend(sorted(category.glob('*.json')))
    return all_files
def insert_puzzle(cursor, file_path):
    """Insert one ConceptArc puzzle file into ``arc_puzzles`` and ``arc_jsons``.

    The puzzle id is the file name without its suffix and the category is the
    name of the containing directory. The stored solution is the JSON list of
    the ``output`` of every entry under ``test``, or NULL when there is none.
    The two inserts are guarded by a savepoint so a failure of either one
    leaves neither row behind in the caller's transaction.

    Args:
        cursor: DB-API cursor with ``%s`` parameter style (pymysql here).
        file_path: ``pathlib.Path`` of the puzzle JSON file.

    Returns:
        tuple[str, str]: ``(puzzle_id, category)``.

    Raises:
        ValueError: if the file does not contain valid JSON.
        Exception: any error raised by ``cursor.execute`` is re-raised after
            rolling back to the savepoint.
    """
    # "Count1.json" -> id "Count1"; parent directory name is the category.
    puzzle_id = file_path.stem
    category = file_path.parent.name

    # Read explicitly as UTF-8 instead of the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        json_content = f.read().strip()

    # Verify JSON is valid before touching the database.
    try:
        puzzle_data = json.loads(json_content)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in {file_path}: {e}") from e

    # ConceptArc puzzles carry several test cases; keep every output grid.
    test_cases = puzzle_data['test'] if 'test' in puzzle_data else []
    all_outputs = [case['output'] for case in test_cases if 'output' in case]
    solution = json.dumps(all_outputs) if all_outputs else None

    # `set` is a reserved keyword, hence the backticks.
    sql_puzzles = """
        INSERT INTO arc_puzzles (id, corpora, `set`, difficulty, version)
        VALUES (%s, %s, %s, %s, %s)
    """
    sql_jsons = """
        INSERT INTO arc_jsons (arc_puzzle_id, json, solution)
        VALUES (%s, %s, %s)
    """

    # Without the savepoint, an error on the second insert (e.g. a duplicate
    # key) would leave the first row pending and it would be committed with
    # the rest of the batch.
    cursor.execute("SAVEPOINT insert_puzzle")
    try:
        cursor.execute(sql_puzzles, (puzzle_id, 'ConceptArc', category, None, None))
        cursor.execute(sql_jsons, (puzzle_id, json_content, solution))
    except Exception:
        cursor.execute("ROLLBACK TO SAVEPOINT insert_puzzle")
        raise
    return puzzle_id, category
def main():
    """Upload every ConceptArc puzzle file into ``arc_puzzles`` / ``arc_jsons``.

    Loads the connection settings, lists the puzzle files, checks that both
    tables exist, asks for confirmation (skipped with ``--yes``), inserts the
    files one by one and commits once at the end. Duplicate-key errors and
    other per-file errors are counted and reported; processing stops after
    more than 10 errors, and rows inserted so far are still committed.

    Returns:
        int: 0 when the upload finishes or is cancelled; 1 when configuration,
        file discovery, the connection, a missing table or an unexpected
        error stops it (an unexpected error rolls the transaction back).
    """
    print("ConceptArc Data Upload to MariaDB")
    print("=" * 50)

    # Load configuration
    try:
        config = load_env_config()
        print("✓ Loaded configuration from .env")
        print(f" Host: {config['host']}")
        print(f" Database: {config['database']}")
        print(f" User: {config['user']}")
    except Exception as e:
        print(f"✗ Error loading configuration: {e}")
        return 1

    # Get all ConceptArc files
    all_files = get_conceptarc_files()
    if not all_files:
        print("✗ No ConceptArc JSON files found")
        return 1
    print(f"✓ Found {len(all_files)} ConceptArc JSON files")

    # Count files per category (category == parent directory name)
    from collections import defaultdict
    category_counts = defaultdict(int)
    for file_path in all_files:
        category_counts[file_path.parent.name] += 1
    print("\nBreakdown by category:")
    for category, count in sorted(category_counts.items()):
        print(f" - {category}: {count} files")

    # Connect to database
    try:
        print("\nConnecting to database...")
        connection = pymysql.connect(**config)
        print("✓ Connected successfully")
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return 1

    try:
        cursor = connection.cursor()

        # Check if tables exist
        cursor.execute("SHOW TABLES LIKE 'arc_puzzles'")
        if not cursor.fetchone():
            print("✗ Table 'arc_puzzles' does not exist")
            return 1
        cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
        if not cursor.fetchone():
            print("✗ Table 'arc_jsons' does not exist")
            return 1

        # Get current counts. The arc_jsons figure is only a name-based
        # heuristic (ids containing "Count" or "Center"), hence "-like" below.
        cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'")
        initial_puzzles_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE arc_puzzle_id LIKE '%Count%' OR arc_puzzle_id LIKE '%Center%'")
        initial_jsons_count = cursor.fetchone()[0]
        print("✓ Tables exist")
        print(f" Current ConceptArc puzzles in arc_puzzles: {initial_puzzles_count}")
        print(f" Current ConceptArc-like entries in arc_jsons: {initial_jsons_count}")

        # Ask for confirmation (unless --yes flag is provided)
        if '--yes' not in sys.argv:
            print(f"\n⚠ About to insert {len(all_files)} ConceptArc records")
            response = input("Continue? (yes/no): ").strip().lower()
            if response not in ['yes', 'y']:
                print("Upload cancelled")
                return 0
        else:
            print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)")

        print("\nInserting records...")
        inserted = 0
        errors = 0
        category_inserted = defaultdict(int)
        for i, file_path in enumerate(all_files, 1):
            try:
                puzzle_id, category = insert_puzzle(cursor, file_path)
                inserted += 1
                category_inserted[category] += 1
                # Show progress every 20 records
                if i % 20 == 0 or i == len(all_files):
                    print(f" Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)")
            except pymysql.IntegrityError as e:
                # Duplicate keys are expected on re-runs: count and move on.
                # Any other integrity error aborts the whole upload.
                if "Duplicate entry" in str(e):
                    errors += 1
                    if errors <= 5:  # Only show first 5 errors
                        print(f" ⚠ Duplicate: {file_path.stem} ({file_path.parent.name})")
                else:
                    raise
            except Exception as e:
                errors += 1
                print(f" ✗ Error with {file_path.stem}: {e}")
                if errors > 10:
                    print(" Too many errors, stopping...")
                    break

        # Commit the transaction
        connection.commit()

        # Get final counts
        cursor.execute("SELECT COUNT(*) FROM arc_puzzles WHERE corpora = 'ConceptArc'")
        final_puzzles_count = cursor.fetchone()[0]

        print(f"\n{'=' * 50}")
        print("✓ Upload complete!")
        print(f" Successfully inserted: {inserted}")
        print(f" Errors/duplicates: {errors}")
        # Separator between the before/after counts restored; without it the
        # two numbers were printed fused together.
        print(f" ConceptArc puzzles: {initial_puzzles_count} → {final_puzzles_count} (+{final_puzzles_count - initial_puzzles_count})")
        print("\nInserted by category:")
        for category, count in sorted(category_inserted.items()):
            print(f" - {category}: {count} puzzles")
    except Exception as e:
        connection.rollback()
        print(f"\n✗ Error during upload: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        connection.close()
        print("\n✓ Database connection closed")
    return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@ -0,0 +1,86 @@
#!/usr/bin/env python3
"""
Verify ConceptArc data was uploaded correctly to the database
"""
import os
import json
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Build the pymysql connection settings from the environment.

    ``load_dotenv()`` is called first so that values from a local ``.env``
    file are visible through ``os.getenv``.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password``, ``database``, ``port`` and ``charset``. Settings that
        are not defined come back as ``None``.

    Raises:
        ValueError: if ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    # os.getenv only applies its default when the variable is absent; a blank
    # "DB_PORT=" yields '' and int('') would raise, so treat blank as unset.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def _solution_summary(sol):
    """Return the one-line description printed for a decoded solution.

    A list whose first element is a non-empty list of lists is reported as a
    list of test-output grids; anything else keeps the single-grid
    "rows x columns" wording.
    """
    is_grid_list = (
        isinstance(sol, list) and len(sol) > 0
        and isinstance(sol[0], list) and len(sol[0]) > 0
        and isinstance(sol[0][0], list)
    )
    if is_grid_list:
        first = sol[0]
        return f"Solution: {len(sol)} test outputs, first grid {len(first)}x{len(first[0])}"
    return f"Solution dimensions: {len(sol)}x{len(sol[0]) if sol else 0}"


def main():
    """Print a read-only check of the uploaded ConceptArc data.

    Shows the ConceptArc puzzle count per category from ``arc_puzzles``, the
    number of joined ``arc_jsons`` rows, and up to three sample puzzles with
    their JSON length and a short description of the stored solution.
    Nothing is returned.
    """
    print("Verifying ConceptArc Upload")
    print("=" * 50)

    config = load_env_config()
    connection = pymysql.connect(**config)
    # try/finally so the connection is released even when a query fails.
    try:
        cursor = connection.cursor()

        # Check arc_puzzles table
        cursor.execute("""
            SELECT `set`, COUNT(*)
            FROM arc_puzzles
            WHERE corpora = 'ConceptArc'
            GROUP BY `set`
            ORDER BY `set`
        """)
        print("\nConceptArc puzzles by category (from arc_puzzles):")
        total = 0
        for category, count in cursor.fetchall():
            print(f" {category}: {count} puzzles")
            total += count
        print(f" TOTAL: {total} puzzles")

        # Check arc_jsons table
        cursor.execute("""
            SELECT COUNT(*)
            FROM arc_jsons aj
            JOIN arc_puzzles ap ON aj.arc_puzzle_id = ap.id
            WHERE ap.corpora = 'ConceptArc'
        """)
        json_count = cursor.fetchone()[0]
        print(f"\nConceptArc entries in arc_jsons: {json_count}")

        # Sample some puzzles
        cursor.execute("""
            SELECT ap.id, ap.corpora, ap.`set`, aj.json, aj.solution
            FROM arc_puzzles ap
            JOIN arc_jsons aj ON ap.id = aj.arc_puzzle_id
            WHERE ap.corpora = 'ConceptArc'
            LIMIT 3
        """)
        print("\n" + "=" * 50)
        print("Sample ConceptArc entries:")
        print("=" * 50)
        for puzzle_id, corpora, category, json_data, solution in cursor.fetchall():
            print(f"\nPuzzle ID: {puzzle_id}")
            print(f" Corpora: {corpora}")
            print(f" Category: {category}")
            print(f" JSON length: {len(json_data)} chars")
            print(f" Has solution: {'Yes' if solution else 'No'}")
            if solution:
                # When the solution is a list of grids, len(sol) is the number
                # of test outputs, not a row count, so it must not be printed
                # as a grid dimension.
                print(f" {_solution_summary(json.loads(solution))}")
    finally:
        connection.close()
    print("\n✓ Verification complete")


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Verify that solutions are stored correctly for different corpora
- V1/V2/evaluation: Single grid (one test case)
- ConceptArc: Array of grids (multiple test cases)
"""
import os
import json
import pymysql
from dotenv import load_dotenv
def load_env_config():
    """Build the pymysql connection settings from the environment.

    ``load_dotenv()`` is called first so that values from a local ``.env``
    file are visible through ``os.getenv``.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` -- ``host``, ``user``,
        ``password``, ``database``, ``port`` and ``charset``. Settings that
        are not defined come back as ``None``.

    Raises:
        ValueError: if ``DB_PORT`` is set to a non-numeric value.
    """
    load_dotenv()
    # os.getenv only applies its default when the variable is absent; a blank
    # "DB_PORT=" yields '' and int('') would raise, so treat blank as unset.
    port = int(os.getenv('DB_PORT') or 3306)
    return {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': port,
        'charset': 'utf8mb4',
    }
def check_solution_format(cursor, corpora_name, expected_test_count):
    """Print how the stored solutions of up to five puzzles of a corpora look.

    For each sampled row the number of entries under ``test`` in the puzzle
    JSON is compared with the number of grids found in the stored solution:
    a list of grids counts its length, a single grid counts as one.

    Args:
        cursor: DB-API cursor with ``%s`` parameter style (pymysql here).
        corpora_name: Value of ``arc_puzzles.corpora`` to sample.
        expected_test_count: Not used by the check; each puzzle is compared
            with the test count of its own JSON. Kept so existing callers
            keep working.

    Returns:
        None. All results are written to stdout.
    """
    print(f"\n{'='*50}")
    print(f"Checking {corpora_name} puzzles:")
    print('='*50)

    cursor.execute("""
        SELECT aj.arc_puzzle_id, aj.json, aj.solution
        FROM arc_jsons aj
        JOIN arc_puzzles ap ON aj.arc_puzzle_id = ap.id
        WHERE ap.corpora = %s
        LIMIT 5
    """, (corpora_name,))
    results = cursor.fetchall()
    if not results:
        print(f"No {corpora_name} puzzles found")
        return

    mismatches = []
    for puzzle_id, json_str, solution in results:
        puzzle_data = json.loads(json_str)
        test_count = len(puzzle_data.get('test', []))

        if not solution:
            print(f"{puzzle_id}: No solution stored!")
            continue

        sol = json.loads(solution)
        # Determine solution structure
        if isinstance(sol, list) and len(sol) > 0:
            first = sol[0]
            if isinstance(first, list) and len(first) > 0 and isinstance(first[0], list):
                # Three levels of nesting: array of grids (ConceptArc style)
                sol_count = len(sol)
                structure = f"Array of {sol_count} grids"
            else:
                # Two levels: single grid (regular ARC style). Guard the
                # width so a non-list first element cannot raise in len().
                sol_count = 1
                width = len(first) if isinstance(first, list) else 0
                structure = f"Single grid ({len(sol)}x{width})"
        else:
            structure = "Unknown format"
            sol_count = 0

        # Both branches used to produce an empty marker, so matching and
        # mismatching rows looked the same.
        match = "✓" if sol_count == test_count else "✗"
        print(f"{match} {puzzle_id}: {test_count} tests, {structure}")
        if sol_count != test_count:
            mismatches.append((puzzle_id, test_count, sol_count))

    if mismatches:
        print(f"\n⚠ Found {len(mismatches)} mismatches:")
        for pid, expected, actual in mismatches:
            print(f" {pid}: Expected {expected} solutions, got {actual}")
    else:
        print("\n✓ All solutions match their test counts!")
def main():
    """Run the solution-format check for each corpora, then print a summary.

    Samples "V1", "V2", "evaluation" and "ConceptArc" in that order through
    ``check_solution_format`` and finishes with, per corpora, how many joined
    ``arc_jsons`` rows have a non-NULL solution.

    Returns:
        int: 0 on success, 1 when an error occurred (it is printed together
        with a traceback).
    """
    print("Verifying Solution Formats")
    print("=" * 50)

    db = pymysql.connect(**load_env_config())
    cur = db.cursor()
    try:
        # (corpora name, expected number of test cases) in reporting order.
        targets = (
            ("V1", 1),
            ("V2", 1),
            ("evaluation", 1),
            ("ConceptArc", 3),
        )
        for corpora_name, expected in targets:
            check_solution_format(cur, corpora_name, expected)

        # Per-corpora totals; COUNT(column) skips NULL solutions.
        print(f"\n{'='*50}")
        print("Summary by corpora:")
        print('='*50)
        cur.execute("""
            SELECT ap.corpora,
                   COUNT(*) as total,
                   COUNT(aj.solution) as with_solution
            FROM arc_puzzles ap
            JOIN arc_jsons aj ON ap.id = aj.arc_puzzle_id
            GROUP BY ap.corpora
            ORDER BY ap.corpora
        """)
        for row in cur.fetchall():
            corpora, total, with_sol = row
            print(f" {corpora}: {with_sol}/{total} have solutions")
    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        db.close()
        print("\n✓ Database connection closed")
    return 0


if __name__ == '__main__':
    import sys
    sys.exit(main())

12
todo.md
View File

@ -1,4 +1,14 @@
Add ConceptARC corpus
Add ConceptARC Corpus
- to the Repository
- To the DB
- Categorize them in the DB
- Fix the Solution extraction method that is different from the other Corpora
Interface:
- Remove the Header frame, make it a single frame interface to increase the
DB:
- Make the View with Skills, Category etc.
user Inputs
Puzzle Assignment