Commit: initial

scripts/check_and_fix_column.py (new file, 54 lines)
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
import pymysql
from dotenv import load_dotenv
import os

load_dotenv()

config = {
    'host': os.getenv('DB_HOST'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': os.getenv('DB_NAME'),
    'port': int(os.getenv('DB_PORT', 3306)),
}

conn = pymysql.connect(**config)
cursor = conn.cursor()

# Check column type
cursor.execute("SHOW COLUMNS FROM arc_jsons WHERE Field = 'json'")
result = cursor.fetchone()
print(f"Current column type: {result[1]}")

# Check if we need to alter it
if 'text' in result[1].lower() and 'medium' not in result[1].lower() and 'long' not in result[1].lower():
    print("\n⚠ Column is TEXT (max 65,535 bytes)")
    print("  One file (4a21e3da.json) is ~69KB and failed to insert")
    print("\nRecommended fix: ALTER TABLE arc_jsons MODIFY json MEDIUMTEXT;")
    print("  MEDIUMTEXT supports up to 16MB")

    response = input("\nApply fix now? (yes/no): ").strip().lower()
    if response in ['yes', 'y']:
        print("\nAltering column to MEDIUMTEXT...")
        cursor.execute("ALTER TABLE arc_jsons MODIFY json MEDIUMTEXT")
        conn.commit()
        print("✓ Column altered successfully!")

        # Now insert the failed record
        print("\nRe-inserting failed record (4a21e3da)...")
        with open('arc_data/evaluation/4a21e3da.json', 'r') as f:
            json_content = f.read().strip()

        cursor.execute("INSERT INTO arc_jsons (id, json) VALUES (%s, %s)", ('4a21e3da', json_content))
        conn.commit()
        print("✓ Record inserted successfully!")

        # Final count
        cursor.execute("SELECT COUNT(*) FROM arc_jsons")
        count = cursor.fetchone()[0]
        print(f"\n✓ Total records in database: {count}")
else:
    print(f"✓ Column type is sufficient: {result[1]}")

conn.close()
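
Note: every script in this commit assumes the arc_jsons table already exists; its DDL is not part of the commit. A minimal sketch of the assumed schema, using the same .env-driven connection (the VARCHAR length is a guess; MEDIUMTEXT matches the fix this script applies):

import os
import pymysql
from dotenv import load_dotenv

load_dotenv()
conn = pymysql.connect(
    host=os.getenv('DB_HOST'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    database=os.getenv('DB_NAME'),
    port=int(os.getenv('DB_PORT', 3306)),
)
with conn.cursor() as cur:
    # id is the puzzle filename stem (e.g. '4a21e3da'); length 64 is an assumption.
    cur.execute("""
        CREATE TABLE IF NOT EXISTS arc_jsons (
            id   VARCHAR(64) PRIMARY KEY,
            json MEDIUMTEXT NOT NULL
        )
    """)
conn.commit()
conn.close()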

scripts/extract_solutions.py (new file, 222 lines)
@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
Extract test outputs from ARC puzzle JSON and store them in the solution column.
This script adds a 'solution' column and populates it with the test output grid.
"""

import json
import os
import sys

import pymysql
from dotenv import load_dotenv

# Force unbuffered output
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)


def load_env_config():
    """Load database configuration from .env file"""
    load_dotenv()

    config = {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': int(os.getenv('DB_PORT', 3306)),
        'charset': 'utf8mb4'
    }

    return config


def check_and_add_solution_column(cursor):
    """Check if solution column exists, add it if not"""
    cursor.execute("""
        SELECT COLUMN_NAME
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = DATABASE()
          AND TABLE_NAME = 'arc_jsons'
          AND COLUMN_NAME = 'solution'
    """)

    if cursor.fetchone():
        print("✓ Column 'solution' already exists")
        return True

    print("Adding 'solution' column to arc_jsons table...")
    cursor.execute("ALTER TABLE arc_jsons ADD COLUMN solution JSON AFTER json")
    print("✓ Column 'solution' added successfully")
    return True


def extract_test_output(json_content):
    """Extract test output from puzzle JSON"""
    try:
        # Parse JSON (handle both string and object)
        if isinstance(json_content, str):
            puzzle_data = json.loads(json_content)
        else:
            puzzle_data = json_content

        # Extract test output
        if puzzle_data.get('test') and len(puzzle_data['test']) > 0:
            test_case = puzzle_data['test'][0]
            if test_case.get('output'):
                return test_case['output']

        return None
    except Exception as e:
        print(f"Error parsing JSON: {e}")
        return None


def update_solution(cursor, puzzle_id, json_content):
    """Extract test output and update solution column"""
    test_output = extract_test_output(json_content)

    if test_output is None:
        return False, "No test output found"

    try:
        # Convert to JSON string
        solution_json = json.dumps(test_output)

        # Update database
        sql = "UPDATE arc_jsons SET solution = %s WHERE id = %s"
        cursor.execute(sql, (solution_json, puzzle_id))

        return True, None
    except Exception as e:
        return False, str(e)


def main():
    print("ARC Solution Extraction Tool")
    print("=" * 60)
    print("This script will:")
    print("  1. Add a 'solution' column to arc_jsons (if needed)")
    print("  2. Extract test outputs from JSON data")
    print("  3. Store solutions in the new column")
    print("=" * 60)

    # Load configuration
    try:
        config = load_env_config()
        print(f"\n✓ Loaded configuration from .env")
        print(f"  Host: {config['host']}")
        print(f"  Database: {config['database']}")
        print(f"  User: {config['user']}")
    except Exception as e:
        print(f"✗ Error loading configuration: {e}")
        return 1

    # Connect to database
    try:
        print(f"\nConnecting to database...")
        connection = pymysql.connect(**config)
        print(f"✓ Connected successfully")
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return 1

    try:
        cursor = connection.cursor()

        # Check if table exists
        cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
        if not cursor.fetchone():
            print(f"✗ Table 'arc_jsons' does not exist")
            return 1

        # Get current count
        cursor.execute("SELECT COUNT(*) FROM arc_jsons")
        total_count = cursor.fetchone()[0]
        print(f"✓ Table 'arc_jsons' found ({total_count} records)")

        # Check/add solution column
        print()
        check_and_add_solution_column(cursor)
        connection.commit()

        # Ask for confirmation (unless --yes flag is provided)
        if '--yes' not in sys.argv:
            print(f"\n⚠ About to process {total_count} records")
            response = input("Continue? (yes/no): ").strip().lower()
            if response not in ['yes', 'y']:
                print("Extraction cancelled")
                return 0
        else:
            print(f"\n⚠ About to process {total_count} records (auto-confirmed with --yes flag)")

        # Fetch all records
        print(f"\nFetching records...")
        cursor.execute("SELECT id, json FROM arc_jsons")
        records = cursor.fetchall()
        print(f"✓ Retrieved {len(records)} records")

        # Process each record
        print(f"\nProcessing records...")
        updated = 0
        errors = 0
        no_output = 0

        for i, (puzzle_id, json_content) in enumerate(records, 1):
            success, error = update_solution(cursor, puzzle_id, json_content)

            if success:
                updated += 1
            elif error == "No test output found":
                no_output += 1
                if no_output <= 5:  # Show first 5 cases
                    print(f"  ⚠ No output: {puzzle_id}")
            else:
                errors += 1
                if errors <= 5:  # Show first 5 errors
                    print(f"  ✗ Error {puzzle_id}: {error}")

            # Show progress every 100 records
            if i % 100 == 0 or i == len(records):
                print(f"  Progress: {i}/{len(records)} ({updated} updated, {no_output} no output, {errors} errors)")

        # Commit the transaction
        connection.commit()

        print(f"\n{'=' * 60}")
        print(f"✓ Extraction complete!")
        print(f"  Successfully updated: {updated}")
        print(f"  No test output: {no_output}")
        print(f"  Errors: {errors}")
        print(f"  Total processed: {len(records)}")

        # Show sample
        if updated > 0:
            print(f"\nSample record (first with solution):")
            cursor.execute("SELECT id, solution FROM arc_jsons WHERE solution IS NOT NULL LIMIT 1")
            sample = cursor.fetchone()
            if sample:
                sample_id, sample_solution = sample
                solution_data = json.loads(sample_solution)
                rows = len(solution_data)
                cols = len(solution_data[0]) if rows > 0 else 0
                print(f"  ID: {sample_id}")
                print(f"  Solution grid: {cols}×{rows}")
                print(f"  First row: {solution_data[0][:10]}..." if cols > 10 else f"  First row: {solution_data[0]}")

    except Exception as e:
        connection.rollback()
        print(f"\n✗ Error during extraction: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        connection.close()
        print(f"\n✓ Database connection closed")

    return 0


if __name__ == '__main__':
    sys.exit(main())
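
Note: extract_test_output() relies on the standard ARC task layout, where each JSON file holds 'train' and 'test' lists of input/output grid pairs (grids are 2-D lists of integers 0-9). An abridged sketch of that shape with tiny illustrative grids:

sample_task = {
    "train": [
        {"input": [[0, 1], [1, 0]], "output": [[1, 0], [0, 1]]},
    ],
    "test": [
        {"input": [[0, 0], [1, 1]], "output": [[1, 1], [0, 0]]},
    ],
}

# extract_test_output() returns the first test case's output grid:
assert extract_test_output(sample_task) == [[1, 1], [0, 0]]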

scripts/generate_sql.py (new file, 79 lines)
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Generate SQL INSERT statements for ARC puzzle data.
Reads all JSON files from arc_data/training and arc_data/evaluation
and creates INSERT statements for the arc_jsons table.
"""

import sys
from pathlib import Path


def escape_sql_string(s):
    """Escape single quotes and backslashes for a SQL string literal"""
    return s.replace("'", "''").replace("\\", "\\\\")


def generate_insert_statement(file_path):
    """Generate INSERT statement for a single JSON file"""
    # Extract ID from filename (without .json extension)
    puzzle_id = Path(file_path).stem

    # Read JSON content
    with open(file_path, 'r') as f:
        json_content = f.read().strip()

    # Escape the JSON string for SQL
    escaped_json = escape_sql_string(json_content)

    # Generate INSERT statement
    sql = f"INSERT INTO arc_jsons (id, json) VALUES ('{puzzle_id}', '{escaped_json}');"

    return sql


def main():
    base_dir = Path('arc_data')

    # Collect all JSON files
    training_files = sorted(base_dir.glob('training/*.json'))
    evaluation_files = sorted(base_dir.glob('evaluation/*.json'))

    all_files = training_files + evaluation_files

    print(f"-- Found {len(training_files)} training files")
    print(f"-- Found {len(evaluation_files)} evaluation files")
    print(f"-- Total: {len(all_files)} files")
    print()

    # Check if user wants preview mode
    preview_mode = '--preview' in sys.argv

    if preview_mode:
        print("-- PREVIEW MODE: Showing first 5 INSERT statements")
        print()
        files_to_show = all_files[:5]
    else:
        print("-- Generating all INSERT statements...")
        print()
        files_to_show = all_files

    # Generate INSERT statements
    for file_path in files_to_show:
        try:
            sql = generate_insert_statement(file_path)
            print(sql)
        except Exception as e:
            print(f"-- ERROR processing {file_path}: {e}", file=sys.stderr)

    if preview_mode:
        print()
        print(f"-- ... and {len(all_files) - 5} more")
        print()
        print("-- To generate all statements, run: python3 generate_sql.py > insert_arc_data.sql")
        print("-- To preview again: python3 generate_sql.py --preview")
    else:
        print()
        print(f"-- Successfully generated {len(all_files)} INSERT statements")


if __name__ == '__main__':
    main()
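
Note: as a worked example, for a hypothetical file arc_data/training/0a1b2c3d.json containing {"train": [], "test": []}, generate_insert_statement() would emit (double quotes need no escaping inside a single-quoted SQL literal):

INSERT INTO arc_jsons (id, json) VALUES ('0a1b2c3d', '{"train": [], "test": []}');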

scripts/generate_v1_csv.py (new file, 51 lines)
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""
Generate a CSV file with ARC V1 task IDs and their set (training/evaluation)
"""

import csv


def main():
    # Read the training IDs
    with open('arc_v1_training_ids.txt', 'r') as f:
        training_ids = [line.strip() for line in f if line.strip()]

    # Read the evaluation IDs
    with open('arc_v1_evaluation_ids.txt', 'r') as f:
        evaluation_ids = [line.strip() for line in f if line.strip()]

    # Create CSV file
    with open('arc_v1_task_ids.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Write header
        writer.writerow(['id', 'set'])

        # Write training tasks
        for task_id in training_ids:
            writer.writerow([task_id, 'training'])

        # Write evaluation tasks
        for task_id in evaluation_ids:
            writer.writerow([task_id, 'evaluation'])

    print("=" * 80)
    print("ARC V1 Task IDs - CSV Export")
    print("=" * 80)
    print(f"\nTraining Tasks: {len(training_ids)}")
    print(f"Evaluation Tasks: {len(evaluation_ids)}")
    print(f"Total: {len(training_ids) + len(evaluation_ids)}")

    print("\nFirst 10 rows (preview):")
    print("-" * 40)
    print("id,set")
    for task_id in training_ids[:5]:
        print(f"{task_id},training")
    for task_id in evaluation_ids[:5]:
        print(f"{task_id},evaluation")

    print("\n✓ Saved to: arc_v1_task_ids.csv")
    print("=" * 80)


if __name__ == '__main__':
    main()
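
Note: the two input files (arc_v1_training_ids.txt, arc_v1_evaluation_ids.txt) are expected to hold one task ID per line. A small sanity check of the generated CSV (a hypothetical helper, not part of this commit):

import csv

with open('arc_v1_task_ids.csv') as f:
    rows = list(csv.reader(f))

assert rows[0] == ['id', 'set']  # header row
assert all(row[1] in ('training', 'evaluation') for row in rows[1:])
print(f"{len(rows) - 1} task IDs verified")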

scripts/generate_v1_list.py (new file, 71 lines)
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
"""
Generate a comprehensive JSON file with official ARC V1 task IDs
"""

import json


def main():
    # Read the training IDs
    with open('arc_v1_training_ids.txt', 'r') as f:
        training_ids = [line.strip() for line in f if line.strip()]

    # Read the evaluation IDs
    with open('arc_v1_evaluation_ids.txt', 'r') as f:
        evaluation_ids = [line.strip() for line in f if line.strip()]

    # Create comprehensive JSON
    data = {
        "version": "ARC-AGI-1 (Official)",
        "source": "https://github.com/fchollet/ARC-AGI v1.0.2",
        "description": "Official ARC-AGI Version 1 task IDs - 400 training + 400 evaluation tasks",
        "total_tasks": len(training_ids) + len(evaluation_ids),
        "training": {
            "count": len(training_ids),
            "task_ids": training_ids
        },
        "evaluation": {
            "count": len(evaluation_ids),
            "task_ids": evaluation_ids
        },
        "all_task_ids": sorted(training_ids + evaluation_ids)
    }

    # Save to JSON
    with open('arc_v1_official_task_ids.json', 'w') as f:
        json.dump(data, f, indent=2)

    print("=" * 80)
    print("ARC V1 Official Task IDs")
    print("=" * 80)
    print(f"\nTraining Tasks: {len(training_ids)}")
    print(f"Evaluation Tasks: {len(evaluation_ids)}")
    print(f"Total: {len(training_ids) + len(evaluation_ids)}")

    print("\nFirst 10 Training IDs:")
    for task_id in training_ids[:10]:
        print(f"  {task_id}")

    print("\nFirst 10 Evaluation IDs:")
    for task_id in evaluation_ids[:10]:
        print(f"  {task_id}")

    print("\n✓ Saved to: arc_v1_official_task_ids.json")
    print("=" * 80)

    # Also create a simple combined text file
    with open('arc_v1_all_ids.txt', 'w') as f:
        f.write("# ARC-AGI Version 1 Official Task IDs\n")
        f.write("# Source: https://github.com/fchollet/ARC-AGI v1.0.2\n")
        f.write(f"# Total: {len(training_ids) + len(evaluation_ids)} tasks\n\n")
        f.write("## Training Tasks (400)\n")
        for task_id in training_ids:
            f.write(f"{task_id}\n")
        f.write("\n## Evaluation Tasks (400)\n")
        for task_id in evaluation_ids:
            f.write(f"{task_id}\n")

    print("✓ Saved to: arc_v1_all_ids.txt")


if __name__ == '__main__':
    main()
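
Note: a usage sketch for reading the generated file back; the expected counts follow from the 400 + 400 split stated in the description field:

import json

with open('arc_v1_official_task_ids.json') as f:
    v1 = json.load(f)

print(v1['training']['count'])    # expected: 400
print(v1['evaluation']['count'])  # expected: 400
print(len(v1['all_task_ids']))    # expected: 800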

scripts/generate_v2_csv.py (new file, 76 lines)
@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Generate a CSV file with ARC V2 task IDs, set, and difficulty
"""

import csv
import json


def main():
    # Read the training IDs
    with open('arc_v2_training_ids.txt', 'r') as f:
        training_ids = [line.strip() for line in f if line.strip()]

    # Read the evaluation IDs
    with open('arc_v2_evaluation_ids.txt', 'r') as f:
        evaluation_ids = [line.strip() for line in f if line.strip()]

    # Create CSV file
    with open('arc_v2_task_ids.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Write header
        writer.writerow(['id', 'set', 'difficulty'])

        # Write training tasks (training = easy per official labeling)
        for task_id in training_ids:
            writer.writerow([task_id, 'training', 'easy'])

        # Write evaluation tasks (evaluation = hard per official labeling)
        for task_id in evaluation_ids:
            writer.writerow([task_id, 'evaluation', 'hard'])

    print("=" * 80)
    print("ARC-AGI-2 Task IDs - CSV Export")
    print("=" * 80)
    print(f"\nTraining Tasks: {len(training_ids)} (labeled: easy)")
    print(f"Evaluation Tasks: {len(evaluation_ids)} (labeled: hard)")
    print(f"Total: {len(training_ids) + len(evaluation_ids)}")

    print("\nFirst 10 rows (preview):")
    print("-" * 50)
    print("id,set,difficulty")
    for task_id in training_ids[:5]:
        print(f"{task_id},training,easy")
    for task_id in evaluation_ids[:5]:
        print(f"{task_id},evaluation,hard")

    print("\n✓ Saved to: arc_v2_task_ids.csv")

    # Also create a JSON version
    data = {
        "version": "ARC-AGI-2",
        "source": "https://github.com/arcprize/ARC-AGI-2",
        "description": "ARC-AGI Version 2 task IDs - 1000 training + 120 evaluation tasks",
        "total_tasks": len(training_ids) + len(evaluation_ids),
        "training": {
            "count": len(training_ids),
            "difficulty": "easy",
            "task_ids": training_ids
        },
        "evaluation": {
            "count": len(evaluation_ids),
            "difficulty": "hard",
            "task_ids": evaluation_ids
        },
        "all_task_ids": sorted(training_ids + evaluation_ids)
    }

    with open('arc_v2_official_task_ids.json', 'w') as f:
        json.dump(data, f, indent=2)

    print("✓ Saved to: arc_v2_official_task_ids.json")
    print("=" * 80)


if __name__ == '__main__':
    main()

scripts/list_v1_task_ids.py (new file, 77 lines)
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Generate a list of all ARC V1 task IDs from the local dataset
"""

import os
import json


def get_task_ids_from_directory(directory):
    """Get all task IDs (filenames without .json) from a directory"""
    task_ids = []
    if os.path.exists(directory):
        for filename in sorted(os.listdir(directory)):
            if filename.endswith('.json'):
                task_id = filename[:-5]  # Remove .json extension
                task_ids.append(task_id)
    return task_ids


def main():
    # Get task IDs from training and evaluation directories
    training_dir = 'arc_data/training'
    evaluation_dir = 'arc_data/evaluation'

    training_ids = get_task_ids_from_directory(training_dir)
    evaluation_ids = get_task_ids_from_directory(evaluation_dir)

    print("ARC V1 Task IDs")
    print("=" * 80)
    print(f"\nTraining Tasks: {len(training_ids)} tasks")
    print(f"Evaluation Tasks: {len(evaluation_ids)} tasks")
    print(f"Total: {len(training_ids) + len(evaluation_ids)} tasks")

    # Save to JSON file
    output_data = {
        "version": "ARC-AGI-1",
        "total_tasks": len(training_ids) + len(evaluation_ids),
        "training": {
            "count": len(training_ids),
            "task_ids": training_ids
        },
        "evaluation": {
            "count": len(evaluation_ids),
            "task_ids": evaluation_ids
        },
        "all_task_ids": sorted(training_ids + evaluation_ids)
    }

    with open('arc_v1_task_ids.json', 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"\n✓ Saved complete list to: arc_v1_task_ids.json")

    # Also save a simple text list
    with open('arc_v1_task_ids.txt', 'w') as f:
        f.write("# ARC V1 Training Task IDs\n")
        for task_id in training_ids:
            f.write(f"{task_id}\n")
        f.write("\n# ARC V1 Evaluation Task IDs\n")
        for task_id in evaluation_ids:
            f.write(f"{task_id}\n")

    print(f"✓ Saved text list to: arc_v1_task_ids.txt")

    # Display first 10 from each set as preview
    print("\n" + "-" * 80)
    print("Preview - First 10 Training Task IDs:")
    for task_id in training_ids[:10]:
        print(f"  {task_id}")

    print("\nPreview - First 10 Evaluation Task IDs:")
    for task_id in evaluation_ids[:10]:
        print(f"  {task_id}")

    print("\n" + "=" * 80)


if __name__ == '__main__':
    main()

scripts/upload_to_db.py (new executable file, 161 lines)
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""
Upload ARC puzzle data to MariaDB database.
Reads credentials from .env file and inserts all JSON files.
"""

import os
import sys
from pathlib import Path

import pymysql
from dotenv import load_dotenv

# Force unbuffered output
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)


def load_env_config():
    """Load database configuration from .env file"""
    load_dotenv()

    config = {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': int(os.getenv('DB_PORT', 3306)),
        'charset': 'utf8mb4'
    }

    return config


def get_all_json_files():
    """Get all JSON files from training and evaluation directories"""
    base_dir = Path('arc_data')

    training_files = sorted(base_dir.glob('training/*.json'))
    evaluation_files = sorted(base_dir.glob('evaluation/*.json'))

    return training_files + evaluation_files


def insert_puzzle(cursor, file_path):
    """Insert a single puzzle into the database"""
    # Extract ID from filename
    puzzle_id = file_path.stem

    # Read JSON content
    with open(file_path, 'r') as f:
        json_content = f.read().strip()

    # Insert into database using parameterized query (prevents SQL injection)
    sql = "INSERT INTO arc_jsons (id, json) VALUES (%s, %s)"
    cursor.execute(sql, (puzzle_id, json_content))

    return puzzle_id


def main():
    print("ARC Data Upload to MariaDB")
    print("=" * 50)

    # Load configuration
    try:
        config = load_env_config()
        print(f"✓ Loaded configuration from .env")
        print(f"  Host: {config['host']}")
        print(f"  Database: {config['database']}")
        print(f"  User: {config['user']}")
    except Exception as e:
        print(f"✗ Error loading configuration: {e}")
        return 1

    # Get all files
    all_files = get_all_json_files()
    print(f"✓ Found {len(all_files)} JSON files")

    # Connect to database
    try:
        print(f"\nConnecting to database...")
        connection = pymysql.connect(**config)
        print(f"✓ Connected successfully")
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return 1

    try:
        cursor = connection.cursor()

        # Check if table exists
        cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
        if not cursor.fetchone():
            print(f"✗ Table 'arc_jsons' does not exist")
            return 1

        # Get current count
        cursor.execute("SELECT COUNT(*) FROM arc_jsons")
        initial_count = cursor.fetchone()[0]
        print(f"✓ Table 'arc_jsons' exists (current rows: {initial_count})")

        # Ask for confirmation (unless --yes flag is provided)
        if '--yes' not in sys.argv:
            print(f"\n⚠ About to insert {len(all_files)} records")
            response = input("Continue? (yes/no): ").strip().lower()
            if response not in ['yes', 'y']:
                print("Upload cancelled")
                return 0
        else:
            print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)")

        print(f"\nInserting records...")
        inserted = 0
        errors = 0

        for i, file_path in enumerate(all_files, 1):
            try:
                puzzle_id = insert_puzzle(cursor, file_path)
                inserted += 1

                # Show progress every 100 records
                if i % 100 == 0 or i == len(all_files):
                    print(f"  Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)")

            except pymysql.IntegrityError as e:
                # Likely duplicate key
                if "Duplicate entry" in str(e):
                    errors += 1
                    if errors <= 5:  # Only show first 5 errors
                        print(f"  ⚠ Duplicate: {file_path.stem}")
                else:
                    raise
            except Exception as e:
                errors += 1
                print(f"  ✗ Error with {file_path.stem}: {e}")
                if errors > 10:
                    print(f"  Too many errors, stopping...")
                    break

        # Commit the transaction
        connection.commit()

        # Get final count
        cursor.execute("SELECT COUNT(*) FROM arc_jsons")
        final_count = cursor.fetchone()[0]

        print(f"\n{'=' * 50}")
        print(f"✓ Upload complete!")
        print(f"  Successfully inserted: {inserted}")
        print(f"  Errors/duplicates: {errors}")
        print(f"  Database rows: {initial_count} → {final_count} (+{final_count - initial_count})")

    except Exception as e:
        connection.rollback()
        print(f"\n✗ Error during upload: {e}")
        return 1
    finally:
        connection.close()
        print(f"\n✓ Database connection closed")

    return 0


if __name__ == '__main__':
    sys.exit(main())
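
Note: the script above skips duplicate keys and counts them as errors. If re-runs should overwrite the stored JSON instead, an upsert variant of insert_puzzle's query could look like this (a sketch, not what this commit does; ON DUPLICATE KEY UPDATE is supported by MariaDB/MySQL):

sql = """
    INSERT INTO arc_jsons (id, json) VALUES (%s, %s)
    ON DUPLICATE KEY UPDATE json = VALUES(json)
"""
cursor.execute(sql, (puzzle_id, json_content))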

scripts/verify_solutions.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Verify that solutions were extracted successfully.
Shows sample solutions from the database.
"""

import json
import os

import pymysql
from dotenv import load_dotenv


def load_env_config():
    """Load database configuration from .env file"""
    load_dotenv()

    config = {
        'host': os.getenv('DB_HOST'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        'database': os.getenv('DB_NAME'),
        'port': int(os.getenv('DB_PORT', 3306)),
        'charset': 'utf8mb4'
    }

    return config


def main():
    print("Solution Verification Tool")
    print("=" * 60)

    # Load configuration and connect
    config = load_env_config()
    connection = pymysql.connect(**config)
    cursor = connection.cursor()

    # Get statistics
    cursor.execute("SELECT COUNT(*) FROM arc_jsons")
    total_records = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE solution IS NOT NULL")
    with_solutions = cursor.fetchone()[0]

    cursor.execute("SELECT COUNT(*) FROM arc_jsons WHERE solution IS NULL")
    without_solutions = cursor.fetchone()[0]

    print(f"\nDatabase Statistics:")
    print(f"  Total records: {total_records}")
    print(f"  With solutions: {with_solutions}")
    print(f"  Without solutions: {without_solutions}")

    # Show 5 sample solutions
    print(f"\nSample Solutions:")
    print("-" * 60)

    cursor.execute("SELECT id, solution FROM arc_jsons WHERE solution IS NOT NULL LIMIT 5")
    samples = cursor.fetchall()

    for i, (puzzle_id, solution_json) in enumerate(samples, 1):
        solution = json.loads(solution_json)
        rows = len(solution)
        cols = len(solution[0]) if rows > 0 else 0

        print(f"\n{i}. Puzzle ID: {puzzle_id}")
        print(f"   Grid size: {cols}×{rows}")
        print(f"   Grid data:")

        # Show grid visually
        for row in solution[:5]:  # Show up to 5 rows
            print(f"   {row}")

        if rows > 5:
            print(f"   ... ({rows - 5} more rows)")

    connection.close()
    print(f"\n{'=' * 60}")
    print("Verification complete!")


if __name__ == '__main__':
    main()
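
Note: sample grids print as nested Python lists. A small rendering helper (hypothetical, not part of this commit) makes them easier to scan:

def render_grid(grid):
    """Render a grid as space-separated digits, one row per line."""
    return '\n'.join(' '.join(str(cell) for cell in row) for row in grid)

print(render_grid([[0, 1, 2], [3, 4, 5]]))
# 0 1 2
# 3 4 5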