""" Unit tests for legal form migration script. Tests the migration from generic legal form enums to ISO 20275 ELF codes. """ import pytest from pathlib import Path import tempfile import yaml from datetime import datetime, timezone # Import migration functions import sys sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts')) from migrate_legal_form_to_iso20275 import ( migrate_legal_form, validate_elf_code, load_elf_codes, MigrationResult, COUNTRY_MAPPINGS, ) # ============================================================================ # FIXTURES # ============================================================================ @pytest.fixture def sample_elf_codes(): """Sample ELF codes for testing""" return { 'V44D': { 'country': 'Netherlands', 'country_code': 'NL', 'local_name': 'Stichting', 'transliterated_name': 'Stichting', 'status': 'ACTV', }, '33MN': { 'country': 'Netherlands', 'country_code': 'NL', 'local_name': 'Vereniging met volledige rechtsbevoegdheid', 'transliterated_name': 'Vereniging', 'status': 'ACTV', }, 'A0W7': { 'country': 'Netherlands', 'country_code': 'NL', 'local_name': 'Publiekrechtelijke rechtspersoon', 'transliterated_name': 'Public entity', 'status': 'ACTV', }, '9999': { 'country': '', 'country_code': '', 'local_name': '', 'transliterated_name': '', 'status': 'ACTV', }, 'ZZZZ': { 'country': 'Test', 'country_code': 'XX', 'local_name': 'Inactive Test', 'transliterated_name': 'Inactive Test', 'status': 'INAC', }, } @pytest.fixture def sample_record_dutch(): """Sample Dutch heritage institution record""" return { 'id': 'https://w3id.org/heritage/org/rijksmuseum', 'legal_name': 'Stichting Rijksmuseum', 'legal_form': 'STICHTING', # Old enum value 'registration_number': 'NL-KvK-41208408', 'locations': [ {'city': 'Amsterdam', 'country': 'NL'} ], 'provenance': { 'data_source': 'CSV_REGISTRY', 'notes': 'Extracted from Dutch ISIL registry' } } @pytest.fixture def sample_record_already_migrated(): """Record already using ISO 20275 code""" return { 'id': 'https://w3id.org/heritage/org/bnf', 'legal_name': 'Bibliothèque nationale de France', 'legal_form': '5RDO', # Already ISO 20275 'locations': [ {'city': 'Paris', 'country': 'FR'} ] } # ============================================================================ # TESTS: ELF CODE VALIDATION # ============================================================================ def test_validate_elf_code_valid(sample_elf_codes): """Test validation of valid ELF code""" is_valid, error = validate_elf_code('V44D', sample_elf_codes) assert is_valid is True assert error is None def test_validate_elf_code_invalid_format(sample_elf_codes): """Test validation rejects invalid format""" is_valid, error = validate_elf_code('ABC', sample_elf_codes) assert is_valid is False assert 'Invalid format' in error def test_validate_elf_code_not_in_registry(sample_elf_codes): """Test validation rejects code not in registry""" is_valid, error = validate_elf_code('XXXX', sample_elf_codes) assert is_valid is False assert 'not found' in error def test_validate_elf_code_inactive(sample_elf_codes): """Test validation rejects inactive code""" is_valid, error = validate_elf_code('ZZZZ', sample_elf_codes) assert is_valid is False assert 'INACTIVE' in error # ============================================================================ # TESTS: MIGRATION LOGIC # ============================================================================ def test_migrate_dutch_stichting(sample_record_dutch, sample_elf_codes): """Test migrating Dutch stichting from enum to ELF code""" result = migrate_legal_form(sample_record_dutch, 'NL', sample_elf_codes) assert result.status == 'migrated' assert result.old_value == 'STICHTING' assert result.new_value == 'V44D' assert result.country == 'NL' assert result.confidence == 1.0 def test_migrate_already_iso20275(sample_record_already_migrated, sample_elf_codes): """Test that already-migrated records are unchanged""" result = migrate_legal_form(sample_record_already_migrated, 'FR', sample_elf_codes) assert result.status == 'unchanged' assert result.old_value == '5RDO' assert result.new_value == '5RDO' assert 'Already valid' in result.notes def test_migrate_no_legal_form(sample_elf_codes): """Test handling of record without legal_form field""" record = { 'id': 'https://w3id.org/heritage/org/test', 'legal_name': 'Test Organization', } result = migrate_legal_form(record, 'NL', sample_elf_codes) assert result.status == 'unchanged' assert result.old_value is None assert 'No legal_form' in result.notes def test_migrate_unknown_enum(sample_elf_codes): """Test handling of unknown enum value""" record = { 'id': 'https://w3id.org/heritage/org/test', 'legal_form': 'UNKNOWN_TYPE', } result = migrate_legal_form(record, 'NL', sample_elf_codes) assert result.status == 'manual_review' assert result.old_value == 'UNKNOWN_TYPE' assert result.new_value is None assert 'Unknown enum' in result.notes def test_migrate_low_confidence(sample_elf_codes): """Test handling of low-confidence mapping""" record = { 'id': 'https://w3id.org/heritage/org/test', 'legal_form': 'TRUST', # Trust has lower confidence in NL context 'locations': [{'country': 'NL'}] } result = migrate_legal_form(record, 'NL', sample_elf_codes, confidence_threshold=0.7) assert result.status == 'manual_review' assert result.old_value == 'TRUST' assert result.new_value == 'V44D' assert result.confidence == 0.6 # Below threshold def test_migrate_country_inferred_from_location(sample_elf_codes): """Test country code inference from location data""" record = { 'id': 'https://w3id.org/heritage/org/test', 'legal_form': 'GOVERNMENT_AGENCY', 'locations': [ {'city': 'Paris', 'country': 'FR'} ] } # Don't provide country_code explicitly result = migrate_legal_form(record, None, sample_elf_codes) # Should use default mapping since country not explicitly provided # (location inference happens at higher level) assert result.status == 'manual_review' assert result.new_value == '5RDO' # Default mapping # ============================================================================ # TESTS: COUNTRY-SPECIFIC MAPPINGS # ============================================================================ def test_french_association_mapping(sample_elf_codes): """Test French association mapping""" record = { 'id': 'https://w3id.org/heritage/org/test', 'legal_form': 'ASSOCIATION', 'locations': [{'country': 'FR'}] } result = migrate_legal_form(record, 'FR', sample_elf_codes) # Note: BEWI not in sample_elf_codes, so will use validation check assert result.old_value == 'ASSOCIATION' # Check mapping exists assert 'FR' in COUNTRY_MAPPINGS assert 'ASSOCIATION' in COUNTRY_MAPPINGS['FR'] def test_german_stiftung_mapping(sample_elf_codes): """Test German foundation (Stiftung) mapping""" assert 'DE' in COUNTRY_MAPPINGS assert 'STICHTING' in COUNTRY_MAPPINGS['DE'] mapping = COUNTRY_MAPPINGS['DE']['STICHTING'] assert mapping.new_elf_code == 'V2YH' assert mapping.confidence == 1.0 def test_uk_charity_mapping(sample_elf_codes): """Test UK charity mapping""" assert 'GB' in COUNTRY_MAPPINGS assert 'NGO' in COUNTRY_MAPPINGS['GB'] mapping = COUNTRY_MAPPINGS['GB']['NGO'] assert mapping.new_elf_code == '9HLU' assert mapping.confidence == 0.95 def test_us_nonprofit_mapping(sample_elf_codes): """Test US nonprofit mapping""" assert 'US' in COUNTRY_MAPPINGS assert 'NGO' in COUNTRY_MAPPINGS['US'] mapping = COUNTRY_MAPPINGS['US']['NGO'] assert mapping.new_elf_code == 'QQQ0' # 501(c)(3) assert mapping.confidence == 0.95 # ============================================================================ # TESTS: EDGE CASES # ============================================================================ def test_migrate_invalid_existing_code(sample_elf_codes): """Test handling of invalid existing ISO 20275 code""" record = { 'id': 'https://w3id.org/heritage/org/test', 'legal_form': 'ZZZZ', # Inactive code } result = migrate_legal_form(record, 'NL', sample_elf_codes) assert result.status == 'error' assert 'Invalid ISO 20275' in result.notes def test_migrate_case_sensitive(sample_elf_codes): """Test that ELF codes are case-sensitive""" record = { 'id': 'https://w3id.org/heritage/org/test', 'legal_form': 'v44d', # Lowercase } result = migrate_legal_form(record, 'NL', sample_elf_codes) # Should be treated as unknown enum, not valid code assert result.status == 'manual_review' # ============================================================================ # TESTS: INTEGRATION # ============================================================================ def test_load_real_elf_codes(): """Test loading real ELF codes CSV file""" elf_csv_path = Path('data/ontology/2023-09-28-elf-code-list-v1.5.csv') if not elf_csv_path.exists(): pytest.skip("ELF codes CSV not available") elf_codes = load_elf_codes(elf_csv_path) assert len(elf_codes) > 2000 # Should have 2,200+ codes assert 'V44D' in elf_codes assert 'BEWI' in elf_codes assert '5RDO' in elf_codes assert elf_codes['V44D']['country_code'] == 'NL' # ============================================================================ # PERFORMANCE TESTS # ============================================================================ @pytest.mark.performance def test_migration_performance(sample_elf_codes): """Test migration performance on large dataset""" import time # Create 1000 test records records = [] for i in range(1000): records.append({ 'id': f'https://w3id.org/heritage/org/test-{i}', 'legal_form': ['STICHTING', 'ASSOCIATION', 'NGO', 'GOVERNMENT_AGENCY'][i % 4], 'locations': [{'country': 'NL'}] }) start = time.time() results = [migrate_legal_form(r, 'NL', sample_elf_codes) for r in records] elapsed = time.time() - start assert len(results) == 1000 assert elapsed < 5.0 # Should complete in under 5 seconds print(f"\n Migrated 1000 records in {elapsed:.2f}s ({1000/elapsed:.0f} records/sec)") if __name__ == '__main__': pytest.main([__file__, '-v'])