#!/usr/bin/env python3
"""
Fix validation errors in Libya dataset before RDF export.

Fixes:
1. LEARNING_MANAGEMENT platform type → WEBSITE (not in PlatformTypeEnum)
2. Empty platform URLs → Remove empty platforms
3. BCE dates (negative years) → Convert to string description in event_description
4. Invalid source_documentation URLs → Convert to notes or remove
5. mailto: URL scheme → Convert to https://
"""

import sys
from datetime import datetime
from pathlib import Path

import yaml

# Paths (resolved relative to this script's location so the script works
# regardless of the current working directory).
LIBYA_YAML = Path(__file__).parent.parent / "data/instances/libya/libyan_institutions.yaml"
BACKUP_YAML = Path(__file__).parent.parent / "data/instances/libya/libyan_institutions_backup_20251111.yaml"


def fix_platform_type(platform):
    """Fix LEARNING_MANAGEMENT → WEBSITE.

    Args:
        platform: A digital-platform dict (mutated in place).

    Returns:
        True if the platform_type was rewritten, False otherwise.
    """
    if platform.get('platform_type') == 'LEARNING_MANAGEMENT':
        platform['platform_type'] = 'WEBSITE'
        return True
    return False


def fix_empty_url(platform):
    """Return True if platform has an empty URL (should be removed).

    Treats a missing key, YAML ``null``, or a whitespace-only string as empty.
    """
    url = platform.get('platform_url') or ''
    # Guard against non-string values (e.g. YAML null) before stripping.
    return not isinstance(url, str) or url.strip() == ''


def fix_bce_date(event):
    """Fix BCE dates (negative years) by moving them into the description.

    ISO-style negative dates such as ``-1100-01-01`` are not valid xsd:date
    values for the RDF export, so the year is appended to
    ``event_description`` as e.g. "(circa 1100 BCE)" and ``event_date`` is
    removed.

    Returns:
        True if a BCE date was fixed, False otherwise.
    """
    event_date = event.get('event_date', '')
    if isinstance(event_date, str) and event_date.startswith('-'):
        # '-1100-01-01'.split('-') → ['', '1100', '01', '01']; index 1 is the year.
        year = event_date.split('-')[1]
        bce_year = f"{year} BCE"
        # Only annotate if the description doesn't already mention a BCE/BC date.
        desc = event.get('event_description', '')
        if 'BCE' not in desc and 'BC' not in desc:
            # .strip() avoids a leading space when desc was empty.
            event['event_description'] = f"{desc} (circa {bce_year})".strip()
        # Remove the invalid event_date so validation passes.
        del event['event_date']
        return True
    return False


def fix_source_documentation(event):
    """Fix invalid (non-http) source_documentation values.

    Conversational references and relative paths are not valid URLs; useful
    context is preserved in ``event_description`` before the field is removed.

    Returns:
        True if an invalid source_documentation was removed, False otherwise.
    """
    source_doc = event.get('source_documentation', '')
    # Only strings can be checked/moved; anything falsy or non-str is left alone.
    if isinstance(source_doc, str) and source_doc and not source_doc.startswith('http'):
        # Preserve known-useful provenance text in the description.
        if 'Provincial Survey' in source_doc or 'Conversation' in source_doc:
            desc = event.get('event_description', '')
            if source_doc not in desc:
                event['event_description'] = f"{desc} (Source: {source_doc})"
        # Remove the invalid URL.
        del event['source_documentation']
        return True
    return False


def fix_mailto_url(identifier):
    """Fix mailto: URL scheme → convert to an https website URL.

    ``mailto:user@example.org`` becomes ``https://example.org``. Malformed
    mailto values without a usable domain are left unchanged.

    Returns:
        True if the identifier_url was rewritten, False otherwise.
    """
    url = identifier.get('identifier_url', '')
    if isinstance(url, str) and url.startswith('mailto:'):
        email = url[len('mailto:'):]
        # Guard against malformed addresses ('mailto:', 'mailto:foo', 'mailto:foo@').
        _, sep, domain = email.partition('@')
        if sep and domain:
            identifier['identifier_url'] = f"https://{domain}"
            return True
    return False


def main():
    """Load the Libya dataset, back it up, apply all fixes, save, and report."""
    print("=" * 80)
    print("Libya Dataset Validation Fixes")
    print("=" * 80)
    print()

    # Load YAML
    print(f"📂 Loading: {LIBYA_YAML}")
    with open(LIBYA_YAML, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    print(f"   Loaded {len(data)} institutions\n")

    # Backup original before mutating anything.
    print(f"💾 Creating backup: {BACKUP_YAML}")
    with open(BACKUP_YAML, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False)
    print("   ✅ Backup created\n")

    # Track changes
    stats = {
        'platform_type_fixed': 0,
        'empty_urls_removed': 0,
        'bce_dates_fixed': 0,
        'invalid_sources_fixed': 0,
        'mailto_fixed': 0,
    }

    # Process each institution
    print("🔧 Applying fixes...\n")
    for i, inst in enumerate(data):
        name = inst.get('name', f'Institution {i}')

        # Fix digital platforms
        if inst.get('digital_platforms'):
            platforms_to_keep = []
            for platform in inst['digital_platforms']:
                # Fix LEARNING_MANAGEMENT
                if fix_platform_type(platform):
                    stats['platform_type_fixed'] += 1
                    print(f"   ✏️  {name}: LEARNING_MANAGEMENT → WEBSITE")
                # Remove empty URLs; everything else is kept.
                if fix_empty_url(platform):
                    stats['empty_urls_removed'] += 1
                    print(f"   🗑️  {name}: Removed platform with empty URL")
                else:
                    platforms_to_keep.append(platform)
            inst['digital_platforms'] = platforms_to_keep

        # Fix change_history
        if inst.get('change_history'):
            for event in inst['change_history']:
                # Fix BCE dates
                if fix_bce_date(event):
                    stats['bce_dates_fixed'] += 1
                    print(f"   📅 {name}: Fixed BCE date in event")
                # Fix invalid source_documentation
                if fix_source_documentation(event):
                    stats['invalid_sources_fixed'] += 1
                    print(f"   🔗 {name}: Fixed invalid source_documentation")

        # Fix identifiers (mailto:)
        if inst.get('identifiers'):
            for identifier in inst['identifiers']:
                if fix_mailto_url(identifier):
                    stats['mailto_fixed'] += 1
                    print(f"   📧 {name}: Fixed mailto: URL")

    # Save fixed YAML
    print(f"\n💾 Saving fixed dataset: {LIBYA_YAML}")
    with open(LIBYA_YAML, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print("   ✅ Saved\n")

    # Report
    print("=" * 80)
    print("📊 FIX SUMMARY")
    print("=" * 80)
    print(f"   Platform types fixed (LEARNING_MANAGEMENT → WEBSITE): {stats['platform_type_fixed']}")
    print(f"   Empty platform URLs removed: {stats['empty_urls_removed']}")
    print(f"   BCE dates fixed: {stats['bce_dates_fixed']}")
    print(f"   Invalid source_documentation fixed: {stats['invalid_sources_fixed']}")
    print(f"   mailto: URLs fixed: {stats['mailto_fixed']}")
    print()
    print(f"   Total fixes: {sum(stats.values())}")
    print()
    print("✅ Libya dataset ready for validation!")
    print()
    print("Next step: Re-run export script:")
    print("   python scripts/export_libya_to_rdf.py")
    print()


if __name__ == '__main__':
    main()