- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering the extraction patterns (ISIL, Wikidata, VIAF, city names) and institution classification (museum, library, archive), plus tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF, including tests for temporal properties in partnerships, compliance with W3C Organization Ontology patterns, and verification that extracted partnerships are correctly linked with PROV-O provenance metadata.
177 lines · 6.4 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix validation errors in Libya dataset before RDF export.
|
|
|
|
Fixes:
|
|
1. LEARNING_MANAGEMENT platform type → WEBSITE (not in PlatformTypeEnum)
|
|
2. Empty platform URLs → Remove empty platforms
|
|
3. BCE dates (negative years) → Convert to string description in event_description
|
|
4. Invalid source_documentation URLs → Convert to notes or remove
|
|
5. mailto: URL scheme → Convert to https://
|
|
"""
|
|
|
|
import yaml
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Paths
# Dataset to fix, resolved relative to the repo root (two levels above this script).
LIBYA_YAML = Path(__file__).parent.parent / "data/instances/libya/libyan_institutions.yaml"
# Pre-fix snapshot written before any changes are applied, so the run is reversible.
BACKUP_YAML = Path(__file__).parent.parent / "data/instances/libya/libyan_institutions_backup_20251111.yaml"
|
|
|
|
def fix_platform_type(platform):
    """Normalize an invalid platform_type value in place.

    LEARNING_MANAGEMENT is not a member of PlatformTypeEnum, so it is
    rewritten to WEBSITE.

    Args:
        platform: Platform dict from an institution record.

    Returns:
        True if the dict was modified, False otherwise.
    """
    needs_fix = platform.get('platform_type') == 'LEARNING_MANAGEMENT'
    if needs_fix:
        platform['platform_type'] = 'WEBSITE'
    return needs_fix
|
|
|
|
def fix_empty_url(platform):
    """Return True if the platform's URL is missing, None, or blank.

    The caller removes such platforms from the institution. The
    ``or ''`` guard covers YAML entries where platform_url is present
    but null (``platform_url:``), which would otherwise raise
    AttributeError on .strip().

    Args:
        platform: Platform dict from an institution record.

    Returns:
        True if the platform should be dropped, False otherwise.
    """
    return not (platform.get('platform_url') or '').strip()
|
|
|
|
def fix_bce_date(event):
    """Move a BCE date (negative year) into the event description.

    Negative years such as '-1100-01-01' are not exportable as valid
    dates, so the event_date field is removed and the year is recorded
    in event_description as e.g. 'circa 1100 BCE' instead.

    Args:
        event: Event dict from an institution's change_history.

    Returns:
        True if the event was modified, False otherwise.
    """
    event_date = event.get('event_date', '')
    # Only string dates can carry a leading '-'; the YAML loader may have
    # parsed valid dates into date objects, which are left untouched.
    if not (isinstance(event_date, str) and event_date.startswith('-')):
        return False

    # '-1100-01-01'.split('-') -> ['', '1100', '01', '01'], so index 1 is the year.
    year = event_date.split('-')[1]
    bce_year = f"{year} BCE"

    # Update the description unless it already mentions BC/BCE.
    # ('BC' also matches 'BCE', so a single substring test suffices.)
    desc = event.get('event_description', '')
    if 'BC' not in desc:
        # strip() avoids a leading-space artifact when the description was empty.
        event['event_description'] = f"{desc} (circa {bce_year})".strip()

    # Remove the invalid date so validation passes.
    del event['event_date']
    return True
|
|
|
|
def fix_source_documentation(event):
    """Drop source_documentation values that are not http(s) URLs.

    Conversational references and relative paths fail URL validation, so
    the field is removed. Known-useful context ('Provincial Survey' /
    'Conversation' references) is preserved by folding it into
    event_description first.

    Args:
        event: Event dict from an institution's change_history.

    Returns:
        True if the event was modified, False otherwise.
    """
    source_doc = event.get('source_documentation', '')

    # Real URLs (and absent/empty values) are left untouched.
    if not source_doc or source_doc.startswith('http'):
        return False

    # Fold useful context into the description before removal.
    if any(marker in source_doc for marker in ('Provincial Survey', 'Conversation')):
        desc = event.get('event_description', '')
        if source_doc not in desc:
            event['event_description'] = f"{desc} (Source: {source_doc})"

    # Remove the invalid URL value.
    del event['source_documentation']
    return True
|
|
|
|
def fix_mailto_url(identifier):
    """Rewrite a mailto: identifier URL as an https:// website URL.

    'mailto:info@example.ly' becomes 'https://example.ly'. A mailto:
    value with no '@' (malformed address) is left unchanged rather than
    raising IndexError, and a null identifier_url is tolerated rather
    than raising AttributeError.

    Args:
        identifier: Identifier dict from an institution record.

    Returns:
        True if the dict was modified, False otherwise.
    """
    # `or ''` guards against YAML entries where the key exists with a null value.
    url = identifier.get('identifier_url', '') or ''
    if not url.startswith('mailto:'):
        return False

    # Extract the email and derive an https URL from its domain part.
    email = url.replace('mailto:', '')
    if '@' not in email:
        # Malformed address with no domain: nothing sensible to convert to.
        return False

    domain = email.split('@')[1]
    identifier['identifier_url'] = f"https://{domain}"
    return True
|
|
|
|
def main():
    """Apply all validation fixes to the Libya dataset in place.

    Steps: load the YAML dataset, write a backup copy, apply the fix_*
    helpers to every institution's digital_platforms / change_history /
    identifiers, save the result back over the original file, and print
    a per-category fix summary.
    """
    print("=" * 80)
    print("Libya Dataset Validation Fixes")
    print("=" * 80)
    print()

    # Load YAML
    print(f"📂 Loading: {LIBYA_YAML}")
    with open(LIBYA_YAML, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # NOTE(review): assumes the file parses to a list of institution dicts;
    # an empty file yields None here and len() would raise — confirm upstream.
    print(f" Loaded {len(data)} institutions\n")

    # Backup original before mutating anything on disk.
    print(f"💾 Creating backup: {BACKUP_YAML}")
    with open(BACKUP_YAML, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False)
    print(" ✅ Backup created\n")

    # Track changes: one counter per fix category, reported in the summary.
    stats = {
        'platform_type_fixed': 0,
        'empty_urls_removed': 0,
        'bce_dates_fixed': 0,
        'invalid_sources_fixed': 0,
        'mailto_fixed': 0
    }

    # Process each institution
    print("🔧 Applying fixes...\n")

    for i, inst in enumerate(data):
        # Fall back to a positional label when a record has no name.
        name = inst.get('name', f'Institution {i}')

        # Fix digital platforms: normalize types, then drop empty-URL entries
        # by rebuilding the list with only the keepers.
        if 'digital_platforms' in inst and inst['digital_platforms']:
            platforms_to_keep = []
            for platform in inst['digital_platforms']:
                # Fix LEARNING_MANAGEMENT
                if fix_platform_type(platform):
                    stats['platform_type_fixed'] += 1
                    print(f" ✏️ {name}: LEARNING_MANAGEMENT → WEBSITE")

                # Remove empty URLs (platform survives only via the else branch)
                if fix_empty_url(platform):
                    stats['empty_urls_removed'] += 1
                    print(f" 🗑️ {name}: Removed platform with empty URL")
                else:
                    platforms_to_keep.append(platform)

            inst['digital_platforms'] = platforms_to_keep

        # Fix change_history events in place (dates and source references).
        if 'change_history' in inst and inst['change_history']:
            for event in inst['change_history']:
                # Fix BCE dates
                if fix_bce_date(event):
                    stats['bce_dates_fixed'] += 1
                    print(f" 📅 {name}: Fixed BCE date in event")

                # Fix invalid source_documentation
                if fix_source_documentation(event):
                    stats['invalid_sources_fixed'] += 1
                    print(f" 🔗 {name}: Fixed invalid source_documentation")

        # Fix identifiers (mailto:)
        if 'identifiers' in inst and inst['identifiers']:
            for identifier in inst['identifiers']:
                if fix_mailto_url(identifier):
                    stats['mailto_fixed'] += 1
                    print(f" 📧 {name}: Fixed mailto: URL")

    # Save fixed YAML over the original (the backup written above preserves
    # the pre-fix state).
    print(f"\n💾 Saving fixed dataset: {LIBYA_YAML}")
    with open(LIBYA_YAML, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(" ✅ Saved\n")

    # Report
    print("=" * 80)
    print("📊 FIX SUMMARY")
    print("=" * 80)
    print(f" Platform types fixed (LEARNING_MANAGEMENT → WEBSITE): {stats['platform_type_fixed']}")
    print(f" Empty platform URLs removed: {stats['empty_urls_removed']}")
    print(f" BCE dates fixed: {stats['bce_dates_fixed']}")
    print(f" Invalid source_documentation fixed: {stats['invalid_sources_fixed']}")
    print(f" mailto: URLs fixed: {stats['mailto_fixed']}")
    print()
    print(f" Total fixes: {sum(stats.values())}")
    print()
    print("✅ Libya dataset ready for validation!")
    print()
    print("Next step: Re-run export script:")
    print(" python scripts/export_libya_to_rdf.py")
    print()


if __name__ == '__main__':
    main()
|