glam/scripts/fix_libya_validation_errors.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

177 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Fix validation errors in Libya dataset before RDF export.
Fixes:
1. LEARNING_MANAGEMENT platform type → WEBSITE (not in PlatformTypeEnum)
2. Empty platform URLs → Remove empty platforms
3. BCE dates (negative years) → Convert to string description in event_description
4. Invalid source_documentation URLs → Convert to notes or remove
5. mailto: URL scheme → Convert to https://
"""
import shutil
import sys
from datetime import datetime
from pathlib import Path

import yaml
# Paths
# Dataset locations resolved relative to this script's parent directory
# (i.e. scripts/.. -> repository root), so the script works from any CWD.
LIBYA_YAML = Path(__file__).parent.parent / "data/instances/libya/libyan_institutions.yaml"
BACKUP_YAML = Path(__file__).parent.parent / "data/instances/libya/libyan_institutions_backup_20251111.yaml"
def fix_platform_type(platform):
    """Rewrite the unsupported LEARNING_MANAGEMENT platform type to WEBSITE.

    Mutates *platform* in place. Returns True when a change was made,
    False when the platform type was already acceptable.
    """
    if platform.get('platform_type') != 'LEARNING_MANAGEMENT':
        return False
    platform['platform_type'] = 'WEBSITE'
    return True
def fix_empty_url(platform):
    """Return True if the platform's URL is missing, None, or blank.

    YAML parses an empty ``platform_url:`` field as None, so coerce the
    value with ``or ''`` before calling .strip() — the previous version
    raised AttributeError on a None value.
    """
    url = platform.get('platform_url') or ''
    return url.strip() == ''
def fix_bce_date(event):
    """Fix BCE dates (negative-year strings) by moving them into the description.

    A date such as '-1100-01-01' is invalid for the schema's date type, so
    the year is rewritten as '1100 BCE' inside ``event_description`` and the
    ``event_date`` key is deleted. Mutates *event* in place; returns True
    when a fix was applied.
    """
    event_date = event.get('event_date', '')
    # Only string dates with a leading '-' are BCE; YAML may also hand us
    # real date objects, which are valid and left alone.
    if not (isinstance(event_date, str) and event_date.startswith('-')):
        return False
    # '-1100-01-01'.split('-') -> ['', '1100', '01', '01']; index 1 is the year.
    year = event_date.split('-')[1]
    bce_year = f"{year} BCE"
    desc = event.get('event_description', '')
    # Don't duplicate an era marker the author already wrote.
    if 'BCE' not in desc and 'BC' not in desc:
        suffix = f"(circa {bce_year})"
        # Avoid the leading-space artifact " (circa ...)" when the
        # description was empty or missing.
        event['event_description'] = f"{desc} {suffix}" if desc else suffix
    # Remove the schema-invalid date.
    del event['event_date']
    return True
def fix_source_documentation(event):
    """Remove non-URL ``source_documentation`` values from an event.

    Valid values must be http(s) URLs. Conversational references worth
    keeping ('Provincial Survey' / 'Conversation') are appended to
    ``event_description`` before the key is deleted. Mutates *event* in
    place; returns True when a fix was applied.
    """
    source_doc = event.get('source_documentation', '')
    # Empty/None values and proper http(s) URLs need no fixing.
    if not source_doc or source_doc.startswith('http'):
        return False
    # Preserve useful context in the description before dropping the field.
    if 'Provincial Survey' in source_doc or 'Conversation' in source_doc:
        desc = event.get('event_description', '')
        if source_doc not in desc:
            note = f"(Source: {source_doc})"
            # Avoid a leading-space artifact when the description was empty.
            event['event_description'] = f"{desc} {note}" if desc else note
    # Remove the schema-invalid value.
    del event['source_documentation']
    return True
def fix_mailto_url(identifier):
    """Convert a ``mailto:`` identifier URL to an https URL on the email's domain.

    E.g. 'mailto:info@example.org' becomes 'https://example.org'. Mutates
    *identifier* in place; returns True when a fix was applied. Guards
    against None values and malformed mailto addresses without an '@'
    (the previous version raised on both).
    """
    url = identifier.get('identifier_url') or ''
    if not url.startswith('mailto:'):
        return False
    email = url[len('mailto:'):]
    if '@' not in email:
        # Malformed address — no domain to derive; leave it for manual review.
        return False
    domain = email.split('@')[1]
    identifier['identifier_url'] = f"https://{domain}"
    return True
def main():
    """Apply all validation fixes to the Libya dataset in place.

    Workflow:
      1. Load the YAML dataset from LIBYA_YAML.
      2. Back up the original file byte-for-byte to BACKUP_YAML. (Copying
         the raw file preserves comments, formatting, and key order; the
         previous version re-dumped the parsed data, which silently
         normalized the backup.)
      3. Apply each fixer to digital platforms, change-history events, and
         identifiers, tracking counts per fix category.
      4. Write the fixed dataset back to LIBYA_YAML and print a summary.
    """
    print("=" * 80)
    print("Libya Dataset Validation Fixes")
    print("=" * 80)
    print()
    # Load YAML
    print(f"📂 Loading: {LIBYA_YAML}")
    with open(LIBYA_YAML, 'r', encoding='utf-8') as f:
        # An empty file parses to None; fall back to an empty list so the
        # script reports zero institutions instead of crashing on len().
        data = yaml.safe_load(f) or []
    print(f" Loaded {len(data)} institutions\n")
    # Backup original file verbatim before any mutation.
    print(f"💾 Creating backup: {BACKUP_YAML}")
    shutil.copyfile(LIBYA_YAML, BACKUP_YAML)
    print(" ✅ Backup created\n")
    # Track changes per fix category for the final report.
    stats = {
        'platform_type_fixed': 0,
        'empty_urls_removed': 0,
        'bce_dates_fixed': 0,
        'invalid_sources_fixed': 0,
        'mailto_fixed': 0
    }
    # Process each institution
    print("🔧 Applying fixes...\n")
    for i, inst in enumerate(data):
        name = inst.get('name', f'Institution {i}')
        # Fix digital platforms: rewrite bad types, drop empty-URL entries.
        if inst.get('digital_platforms'):
            platforms_to_keep = []
            for platform in inst['digital_platforms']:
                # Fix LEARNING_MANAGEMENT
                if fix_platform_type(platform):
                    stats['platform_type_fixed'] += 1
                    print(f" ✏️ {name}: LEARNING_MANAGEMENT → WEBSITE")
                # Remove empty URLs
                if fix_empty_url(platform):
                    stats['empty_urls_removed'] += 1
                    print(f" 🗑️ {name}: Removed platform with empty URL")
                else:
                    platforms_to_keep.append(platform)
            inst['digital_platforms'] = platforms_to_keep
        # Fix change_history events: BCE dates and non-URL sources.
        if inst.get('change_history'):
            for event in inst['change_history']:
                # Fix BCE dates
                if fix_bce_date(event):
                    stats['bce_dates_fixed'] += 1
                    print(f" 📅 {name}: Fixed BCE date in event")
                # Fix invalid source_documentation
                if fix_source_documentation(event):
                    stats['invalid_sources_fixed'] += 1
                    print(f" 🔗 {name}: Fixed invalid source_documentation")
        # Fix identifiers (mailto:)
        if inst.get('identifiers'):
            for identifier in inst['identifiers']:
                if fix_mailto_url(identifier):
                    stats['mailto_fixed'] += 1
                    print(f" 📧 {name}: Fixed mailto: URL")
    # Save fixed YAML
    print(f"\n💾 Saving fixed dataset: {LIBYA_YAML}")
    with open(LIBYA_YAML, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(" ✅ Saved\n")
    # Report
    print("=" * 80)
    print("📊 FIX SUMMARY")
    print("=" * 80)
    print(f" Platform types fixed (LEARNING_MANAGEMENT → WEBSITE): {stats['platform_type_fixed']}")
    print(f" Empty platform URLs removed: {stats['empty_urls_removed']}")
    print(f" BCE dates fixed: {stats['bce_dates_fixed']}")
    print(f" Invalid source_documentation fixed: {stats['invalid_sources_fixed']}")
    print(f" mailto: URLs fixed: {stats['mailto_fixed']}")
    print()
    print(f" Total fixes: {sum(stats.values())}")
    print()
    print("✅ Libya dataset ready for validation!")
    print()
    print("Next step: Re-run export script:")
    print(" python scripts/export_libya_to_rdf.py")
    print()
# Standard script entry point: run the fixer only when executed directly,
# not when imported.
if __name__ == '__main__':
    main()