This commit is contained in:
bmachado
2025-11-05 00:24:05 +00:00
commit b8856c0660
1157 changed files with 26817 additions and 0 deletions

161
scripts/upload_to_db.py Executable file
View File

@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""
Upload ARC puzzle data to MariaDB database
Reads credentials from .env file and inserts all JSON files
"""
import json
import os
import sys
from pathlib import Path
import pymysql
from dotenv import load_dotenv
# Force unbuffered output
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
def load_env_config():
"""Load database configuration from .env file"""
load_dotenv()
config = {
'host': os.getenv('DB_HOST'),
'user': os.getenv('DB_USER'),
'password': os.getenv('DB_PASSWORD'),
'database': os.getenv('DB_NAME'),
'port': int(os.getenv('DB_PORT', 3306)),
'charset': 'utf8mb4'
}
return config
def get_all_json_files():
"""Get all JSON files from training and evaluation directories"""
base_dir = Path('arc_data')
training_files = sorted(base_dir.glob('training/*.json'))
evaluation_files = sorted(base_dir.glob('evaluation/*.json'))
return training_files + evaluation_files
def insert_puzzle(cursor, file_path):
"""Insert a single puzzle into the database"""
# Extract ID from filename
puzzle_id = file_path.stem
# Read JSON content
with open(file_path, 'r') as f:
json_content = f.read().strip()
# Insert into database using parameterized query (prevents SQL injection)
sql = "INSERT INTO arc_jsons (id, json) VALUES (%s, %s)"
cursor.execute(sql, (puzzle_id, json_content))
return puzzle_id
def main():
print("ARC Data Upload to MariaDB")
print("=" * 50)
# Load configuration
try:
config = load_env_config()
print(f"✓ Loaded configuration from .env")
print(f" Host: {config['host']}")
print(f" Database: {config['database']}")
print(f" User: {config['user']}")
except Exception as e:
print(f"✗ Error loading configuration: {e}")
return 1
# Get all files
all_files = get_all_json_files()
print(f"✓ Found {len(all_files)} JSON files")
# Connect to database
try:
print(f"\nConnecting to database...")
connection = pymysql.connect(**config)
print(f"✓ Connected successfully")
except Exception as e:
print(f"✗ Database connection failed: {e}")
return 1
try:
cursor = connection.cursor()
# Check if table exists
cursor.execute("SHOW TABLES LIKE 'arc_jsons'")
if not cursor.fetchone():
print(f"✗ Table 'arc_jsons' does not exist")
return 1
# Get current count
cursor.execute("SELECT COUNT(*) FROM arc_jsons")
initial_count = cursor.fetchone()[0]
print(f"✓ Table 'arc_jsons' exists (current rows: {initial_count})")
# Ask for confirmation (unless --yes flag is provided)
if '--yes' not in sys.argv:
print(f"\n⚠ About to insert {len(all_files)} records")
response = input("Continue? (yes/no): ").strip().lower()
if response not in ['yes', 'y']:
print("Upload cancelled")
return 0
else:
print(f"\n⚠ About to insert {len(all_files)} records (auto-confirmed with --yes flag)")
print(f"\nInserting records...")
inserted = 0
errors = 0
for i, file_path in enumerate(all_files, 1):
try:
puzzle_id = insert_puzzle(cursor, file_path)
inserted += 1
# Show progress every 100 records
if i % 100 == 0 or i == len(all_files):
print(f" Progress: {i}/{len(all_files)} ({inserted} inserted, {errors} errors)")
except pymysql.IntegrityError as e:
# Likely duplicate key
if "Duplicate entry" in str(e):
errors += 1
if errors <= 5: # Only show first 5 errors
print(f" ⚠ Duplicate: {file_path.stem}")
else:
raise
except Exception as e:
errors += 1
print(f" ✗ Error with {file_path.stem}: {e}")
if errors > 10:
print(f" Too many errors, stopping...")
break
# Commit the transaction
connection.commit()
# Get final count
cursor.execute("SELECT COUNT(*) FROM arc_jsons")
final_count = cursor.fetchone()[0]
print(f"\n{'=' * 50}")
print(f"✓ Upload complete!")
print(f" Successfully inserted: {inserted}")
print(f" Errors/duplicates: {errors}")
print(f" Database rows: {initial_count}{final_count} (+{final_count - initial_count})")
except Exception as e:
connection.rollback()
print(f"\n✗ Error during upload: {e}")
return 1
finally:
connection.close()
print(f"\n✓ Database connection closed")
return 0
if __name__ == '__main__':
sys.exit(main())