- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering the extraction patterns (ISIL, Wikidata, VIAF, city names) and institution classification (museum, library, archive), plus tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF, including tests for temporal properties in partnerships, compliance with W3C Organization Ontology patterns, and verification that extracted partnerships are correctly linked with PROV-O provenance metadata.
177 lines · 6.4 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix validation errors in Libya dataset before RDF export.
|
|
|
|
Fixes:
|
|
1. LEARNING_MANAGEMENT platform type → WEBSITE (not in PlatformTypeEnum)
|
|
2. Empty platform URLs → Remove empty platforms
|
|
3. BCE dates (negative years) → Convert to string description in event_description
|
|
4. Invalid source_documentation URLs → Convert to notes or remove
|
|
5. mailto: URL scheme → Convert to https://
|
|
"""
|
|
|
|
import yaml
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Paths
# Dataset to fix, resolved relative to the repo root (two levels above this script).
LIBYA_YAML = Path(__file__).parent.parent / "data/instances/libya/libyan_institutions.yaml"
# Pre-fix snapshot written before any changes are applied, so the run is reversible.
BACKUP_YAML = Path(__file__).parent.parent / "data/instances/libya/libyan_institutions_backup_20251111.yaml"
|
|
|
|
def fix_platform_type(platform):
    """Normalize an invalid platform_type value in place.

    LEARNING_MANAGEMENT is not a member of PlatformTypeEnum, so it is
    rewritten to WEBSITE.

    Args:
        platform: Platform dict from an institution record.

    Returns:
        True if the dict was modified, False otherwise.
    """
    needs_fix = platform.get('platform_type') == 'LEARNING_MANAGEMENT'
    if needs_fix:
        platform['platform_type'] = 'WEBSITE'
    return needs_fix
|
|
|
|
def fix_empty_url(platform):
    """Return True if the platform's URL is missing, None, or blank.

    The caller removes such platforms from the institution. The
    ``or ''`` guard covers YAML entries where platform_url is present
    but null (``platform_url:``), which would otherwise raise
    AttributeError on .strip().

    Args:
        platform: Platform dict from an institution record.

    Returns:
        True if the platform should be dropped, False otherwise.
    """
    return not (platform.get('platform_url') or '').strip()
|
|
|
|
def fix_bce_date(event):
    """Move a BCE date (negative year) into the event description.

    Negative years such as '-1100-01-01' are not exportable as valid
    dates, so the event_date field is removed and the year is recorded
    in event_description as e.g. 'circa 1100 BCE' instead.

    Args:
        event: Event dict from an institution's change_history.

    Returns:
        True if the event was modified, False otherwise.
    """
    event_date = event.get('event_date', '')
    # Only string dates can carry a leading '-'; the YAML loader may have
    # parsed valid dates into date objects, which are left untouched.
    if not (isinstance(event_date, str) and event_date.startswith('-')):
        return False

    # '-1100-01-01'.split('-') -> ['', '1100', '01', '01'], so index 1 is the year.
    year = event_date.split('-')[1]
    bce_year = f"{year} BCE"

    # Update the description unless it already mentions BC/BCE.
    # ('BC' also matches 'BCE', so a single substring test suffices.)
    desc = event.get('event_description', '')
    if 'BC' not in desc:
        # strip() avoids a leading-space artifact when the description was empty.
        event['event_description'] = f"{desc} (circa {bce_year})".strip()

    # Remove the invalid date so validation passes.
    del event['event_date']
    return True
|
|
|
|
def fix_source_documentation(event):
    """Drop source_documentation values that are not http(s) URLs.

    Conversational references and relative paths fail URL validation, so
    the field is removed. Known-useful context ('Provincial Survey' /
    'Conversation' references) is preserved by folding it into
    event_description first.

    Args:
        event: Event dict from an institution's change_history.

    Returns:
        True if the event was modified, False otherwise.
    """
    source_doc = event.get('source_documentation', '')

    # Real URLs (and absent/empty values) are left untouched.
    if not source_doc or source_doc.startswith('http'):
        return False

    # Fold useful context into the description before removal.
    if any(marker in source_doc for marker in ('Provincial Survey', 'Conversation')):
        desc = event.get('event_description', '')
        if source_doc not in desc:
            event['event_description'] = f"{desc} (Source: {source_doc})"

    # Remove the invalid URL value.
    del event['source_documentation']
    return True
|
|
|
|
def fix_mailto_url(identifier):
    """Rewrite a mailto: identifier URL as an https:// website URL.

    'mailto:info@example.ly' becomes 'https://example.ly'. A mailto:
    value with no '@' (malformed address) is left unchanged rather than
    raising IndexError, and a null identifier_url is tolerated rather
    than raising AttributeError.

    Args:
        identifier: Identifier dict from an institution record.

    Returns:
        True if the dict was modified, False otherwise.
    """
    # `or ''` guards against YAML entries where the key exists with a null value.
    url = identifier.get('identifier_url', '') or ''
    if not url.startswith('mailto:'):
        return False

    # Extract the email and derive an https URL from its domain part.
    email = url.replace('mailto:', '')
    if '@' not in email:
        # Malformed address with no domain: nothing sensible to convert to.
        return False

    domain = email.split('@')[1]
    identifier['identifier_url'] = f"https://{domain}"
    return True
|
|
|
|
def main():
    """Apply all validation fixes to the Libya dataset in place.

    Steps: load the YAML dataset, write a backup copy, apply the fix_*
    helpers to every institution's digital_platforms / change_history /
    identifiers, save the result back over the original file, and print
    a per-category fix summary.
    """
    print("=" * 80)
    print("Libya Dataset Validation Fixes")
    print("=" * 80)
    print()

    # Load YAML
    print(f"📂 Loading: {LIBYA_YAML}")
    with open(LIBYA_YAML, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # NOTE(review): assumes the file parses to a list of institution dicts;
    # an empty file yields None here and len() would raise — confirm upstream.
    print(f" Loaded {len(data)} institutions\n")

    # Backup original before mutating anything on disk.
    print(f"💾 Creating backup: {BACKUP_YAML}")
    with open(BACKUP_YAML, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False)
    print(" ✅ Backup created\n")

    # Track changes: one counter per fix category, reported in the summary.
    stats = {
        'platform_type_fixed': 0,
        'empty_urls_removed': 0,
        'bce_dates_fixed': 0,
        'invalid_sources_fixed': 0,
        'mailto_fixed': 0
    }

    # Process each institution
    print("🔧 Applying fixes...\n")

    for i, inst in enumerate(data):
        # Fall back to a positional label when a record has no name.
        name = inst.get('name', f'Institution {i}')

        # Fix digital platforms: normalize types, then drop empty-URL entries
        # by rebuilding the list with only the keepers.
        if 'digital_platforms' in inst and inst['digital_platforms']:
            platforms_to_keep = []
            for platform in inst['digital_platforms']:
                # Fix LEARNING_MANAGEMENT
                if fix_platform_type(platform):
                    stats['platform_type_fixed'] += 1
                    print(f" ✏️ {name}: LEARNING_MANAGEMENT → WEBSITE")

                # Remove empty URLs (platform survives only via the else branch)
                if fix_empty_url(platform):
                    stats['empty_urls_removed'] += 1
                    print(f" 🗑️ {name}: Removed platform with empty URL")
                else:
                    platforms_to_keep.append(platform)

            inst['digital_platforms'] = platforms_to_keep

        # Fix change_history events in place (dates and source references).
        if 'change_history' in inst and inst['change_history']:
            for event in inst['change_history']:
                # Fix BCE dates
                if fix_bce_date(event):
                    stats['bce_dates_fixed'] += 1
                    print(f" 📅 {name}: Fixed BCE date in event")

                # Fix invalid source_documentation
                if fix_source_documentation(event):
                    stats['invalid_sources_fixed'] += 1
                    print(f" 🔗 {name}: Fixed invalid source_documentation")

        # Fix identifiers (mailto:)
        if 'identifiers' in inst and inst['identifiers']:
            for identifier in inst['identifiers']:
                if fix_mailto_url(identifier):
                    stats['mailto_fixed'] += 1
                    print(f" 📧 {name}: Fixed mailto: URL")

    # Save fixed YAML over the original (the backup written above preserves
    # the pre-fix state).
    print(f"\n💾 Saving fixed dataset: {LIBYA_YAML}")
    with open(LIBYA_YAML, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(" ✅ Saved\n")

    # Report
    print("=" * 80)
    print("📊 FIX SUMMARY")
    print("=" * 80)
    print(f" Platform types fixed (LEARNING_MANAGEMENT → WEBSITE): {stats['platform_type_fixed']}")
    print(f" Empty platform URLs removed: {stats['empty_urls_removed']}")
    print(f" BCE dates fixed: {stats['bce_dates_fixed']}")
    print(f" Invalid source_documentation fixed: {stats['invalid_sources_fixed']}")
    print(f" mailto: URLs fixed: {stats['mailto_fixed']}")
    print()
    print(f" Total fixes: {sum(stats.values())}")
    print()
    print("✅ Libya dataset ready for validation!")
    print()
    print("Next step: Re-run export script:")
    print(" python scripts/export_libya_to_rdf.py")
    print()


if __name__ == '__main__':
    main()
|