glam/tests/test_legal_form_migration.py
2025-11-21 22:12:33 +01:00

354 lines
11 KiB
Python

"""
Unit tests for legal form migration script.
Tests the migration from generic legal form enums to ISO 20275 ELF codes.
"""
import pytest
from pathlib import Path
import tempfile
import yaml
from datetime import datetime, timezone
# Import migration functions
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts'))
from migrate_legal_form_to_iso20275 import (
migrate_legal_form,
validate_elf_code,
load_elf_codes,
MigrationResult,
COUNTRY_MAPPINGS,
)
# ============================================================================
# FIXTURES
# ============================================================================
@pytest.fixture
def sample_elf_codes():
    """Provide a small in-memory ELF code registry keyed by ELF code.

    The registry holds three Dutch (``NL``) entries, the blank ``9999``
    entry, and ``ZZZZ``, the only entry whose status is ``INAC`` rather
    than ``ACTV``.
    """
    field_names = (
        'country',
        'country_code',
        'local_name',
        'transliterated_name',
        'status',
    )
    # One row per ELF code; values line up positionally with field_names.
    rows = (
        ('V44D', ('Netherlands', 'NL', 'Stichting', 'Stichting', 'ACTV')),
        (
            '33MN',
            (
                'Netherlands',
                'NL',
                'Vereniging met volledige rechtsbevoegdheid',
                'Vereniging',
                'ACTV',
            ),
        ),
        (
            'A0W7',
            (
                'Netherlands',
                'NL',
                'Publiekrechtelijke rechtspersoon',
                'Public entity',
                'ACTV',
            ),
        ),
        ('9999', ('', '', '', '', 'ACTV')),
        ('ZZZZ', ('Test', 'XX', 'Inactive Test', 'Inactive Test', 'INAC')),
    )
    return {code: dict(zip(field_names, values)) for code, values in rows}
@pytest.fixture
def sample_record_dutch():
    """Provide a Dutch institution record that still uses a legacy enum value.

    The ``legal_form`` field holds the old enum ``'STICHTING'`` rather than
    an ISO 20275 code.
    """
    amsterdam = {'city': 'Amsterdam', 'country': 'NL'}
    provenance = {
        'data_source': 'CSV_REGISTRY',
        'notes': 'Extracted from Dutch ISIL registry',
    }
    record = {}
    record['id'] = 'https://w3id.org/heritage/org/rijksmuseum'
    record['legal_name'] = 'Stichting Rijksmuseum'
    record['legal_form'] = 'STICHTING'  # old enum value, not an ELF code
    record['registration_number'] = 'NL-KvK-41208408'
    record['locations'] = [amsterdam]
    record['provenance'] = provenance
    return record
@pytest.fixture
def sample_record_already_migrated():
    """Provide a French record whose ``legal_form`` is already an ISO 20275 code."""
    paris = {'city': 'Paris', 'country': 'FR'}
    record = {}
    record['id'] = 'https://w3id.org/heritage/org/bnf'
    record['legal_name'] = 'Bibliothèque nationale de France'
    record['legal_form'] = '5RDO'  # already an ISO 20275 code
    record['locations'] = [paris]
    return record
# ============================================================================
# TESTS: ELF CODE VALIDATION
# ============================================================================
def test_validate_elf_code_valid(sample_elf_codes):
    """A code that is present in the registry validates with no error."""
    known_code = 'V44D'
    ok, message = validate_elf_code(known_code, sample_elf_codes)
    assert ok is True
    assert message is None
def test_validate_elf_code_invalid_format(sample_elf_codes):
    """A three-character code is rejected with an 'Invalid format' error."""
    malformed_code = 'ABC'
    ok, message = validate_elf_code(malformed_code, sample_elf_codes)
    assert ok is False
    assert 'Invalid format' in message
def test_validate_elf_code_not_in_registry(sample_elf_codes):
    """A code absent from the sample registry is rejected as 'not found'."""
    unregistered_code = 'XXXX'
    ok, message = validate_elf_code(unregistered_code, sample_elf_codes)
    assert ok is False
    assert 'not found' in message
def test_validate_elf_code_inactive(sample_elf_codes):
    """The registry entry with status 'INAC' is rejected as INACTIVE."""
    inactive_code = 'ZZZZ'
    ok, message = validate_elf_code(inactive_code, sample_elf_codes)
    assert ok is False
    assert 'INACTIVE' in message
# ============================================================================
# TESTS: MIGRATION LOGIC
# ============================================================================
def test_migrate_dutch_stichting(sample_record_dutch, sample_elf_codes):
    """The Dutch 'STICHTING' enum migrates to ELF code 'V44D' at full confidence."""
    outcome = migrate_legal_form(sample_record_dutch, 'NL', sample_elf_codes)
    expected_fields = {
        'status': 'migrated',
        'old_value': 'STICHTING',
        'new_value': 'V44D',
        'country': 'NL',
        'confidence': 1.0,
    }
    # Checked one field at a time, in the listed order.
    for field, expected in expected_fields.items():
        assert getattr(outcome, field) == expected
def test_migrate_already_iso20275(sample_record_already_migrated, sample_elf_codes):
    """A record that already carries an ISO 20275 code is reported as unchanged."""
    existing_code = '5RDO'
    outcome = migrate_legal_form(
        sample_record_already_migrated,
        'FR',
        sample_elf_codes,
    )
    assert outcome.status == 'unchanged'
    # Old and new values must both be the code the record already had.
    assert outcome.old_value == existing_code
    assert outcome.new_value == existing_code
    assert 'Already valid' in outcome.notes
def test_migrate_no_legal_form(sample_elf_codes):
    """A record with no ``legal_form`` key is left unchanged and says so in notes."""
    record_without_form = dict([
        ('id', 'https://w3id.org/heritage/org/test'),
        ('legal_name', 'Test Organization'),
    ])
    outcome = migrate_legal_form(record_without_form, 'NL', sample_elf_codes)
    assert outcome.status == 'unchanged'
    assert outcome.old_value is None
    assert 'No legal_form' in outcome.notes
def test_migrate_unknown_enum(sample_elf_codes):
    """An unrecognised enum value is routed to manual review with no new code."""
    unknown_value = 'UNKNOWN_TYPE'
    record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': unknown_value,
    }
    outcome = migrate_legal_form(record, 'NL', sample_elf_codes)
    assert outcome.status == 'manual_review'
    assert outcome.old_value == unknown_value
    assert outcome.new_value is None
    assert 'Unknown enum' in outcome.notes
def test_migrate_low_confidence(sample_elf_codes):
    """A mapping whose confidence is under the threshold goes to manual review.

    The original author notes that 'TRUST' has lower confidence in the NL
    context; the expected confidence of 0.6 is below the 0.7 threshold
    passed to the migration.
    """
    record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': 'TRUST',
        'locations': [{'country': 'NL'}],
    }
    outcome = migrate_legal_form(
        record,
        'NL',
        sample_elf_codes,
        confidence_threshold=0.7,
    )
    assert outcome.status == 'manual_review'
    assert outcome.old_value == 'TRUST'
    # A candidate code is still proposed even though review is required.
    assert outcome.new_value == 'V44D'
    assert outcome.confidence == 0.6
def test_migrate_country_inferred_from_location(sample_elf_codes):
    """Without an explicit country code the default mapping is expected.

    The record carries a French location, but ``None`` is passed as the
    country code. Per the original author's note, location-based inference
    happens at a higher level, so this call is expected to fall back to the
    default mapping and flag the record for manual review.
    """
    paris = {'city': 'Paris', 'country': 'FR'}
    record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': 'GOVERNMENT_AGENCY',
        'locations': [paris],
    }
    no_country = None  # deliberately omit the explicit country code
    outcome = migrate_legal_form(record, no_country, sample_elf_codes)
    assert outcome.status == 'manual_review'
    assert outcome.new_value == '5RDO'  # default mapping
# ============================================================================
# TESTS: COUNTRY-SPECIFIC MAPPINGS
# ============================================================================
def test_french_association_mapping(sample_elf_codes):
    """The FR mapping table has an 'ASSOCIATION' entry and migration sees the enum.

    Only ``old_value`` is asserted on the migration result: the original
    author notes that BEWI is not in the sample registry, so the outcome
    depends on the validation check.
    """
    legacy_value = 'ASSOCIATION'
    record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': legacy_value,
        'locations': [{'country': 'FR'}],
    }
    outcome = migrate_legal_form(record, 'FR', sample_elf_codes)
    assert outcome.old_value == legacy_value
    # The mapping itself must exist regardless of the registry contents.
    assert 'FR' in COUNTRY_MAPPINGS
    assert legacy_value in COUNTRY_MAPPINGS['FR']
def test_german_stiftung_mapping(sample_elf_codes):
    """Under 'DE', the 'STICHTING' enum maps to 'V2YH' with confidence 1.0."""
    country, legacy_value = 'DE', 'STICHTING'
    assert country in COUNTRY_MAPPINGS
    assert legacy_value in COUNTRY_MAPPINGS[country]
    entry = COUNTRY_MAPPINGS[country][legacy_value]
    assert entry.new_elf_code == 'V2YH'
    assert entry.confidence == 1.0
def test_uk_charity_mapping(sample_elf_codes):
    """Under 'GB', the 'NGO' enum maps to '9HLU' with confidence 0.95."""
    country, legacy_value = 'GB', 'NGO'
    assert country in COUNTRY_MAPPINGS
    assert legacy_value in COUNTRY_MAPPINGS[country]
    entry = COUNTRY_MAPPINGS[country][legacy_value]
    assert entry.new_elf_code == '9HLU'
    assert entry.confidence == 0.95
def test_us_nonprofit_mapping(sample_elf_codes):
    """Under 'US', the 'NGO' enum maps to 'QQQ0' with confidence 0.95."""
    country, legacy_value = 'US', 'NGO'
    assert country in COUNTRY_MAPPINGS
    assert legacy_value in COUNTRY_MAPPINGS[country]
    entry = COUNTRY_MAPPINGS[country][legacy_value]
    # Original author's note labels this code as 501(c)(3).
    assert entry.new_elf_code == 'QQQ0'
    assert entry.confidence == 0.95
# ============================================================================
# TESTS: EDGE CASES
# ============================================================================
def test_migrate_invalid_existing_code(sample_elf_codes):
    """A record holding the inactive code 'ZZZZ' is reported as an error."""
    inactive_code = 'ZZZZ'  # status 'INAC' in the sample registry
    record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': inactive_code,
    }
    outcome = migrate_legal_form(record, 'NL', sample_elf_codes)
    assert outcome.status == 'error'
    assert 'Invalid ISO 20275' in outcome.notes
def test_migrate_case_sensitive(sample_elf_codes):
    """A lowercase spelling of a registered code is not accepted as that code.

    'v44d' is expected to be handled like an unknown enum value, which
    means manual review, rather than as the valid code 'V44D'.
    """
    lowercase_code = 'v44d'
    record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': lowercase_code,
    }
    outcome = migrate_legal_form(record, 'NL', sample_elf_codes)
    assert outcome.status == 'manual_review'
# ============================================================================
# TESTS: INTEGRATION
# ============================================================================
def test_load_real_elf_codes():
    """Load the real ELF code list CSV and spot-check its contents.

    The CSV is looked up first relative to the current working directory
    (the original behaviour) and then relative to the repository root
    derived from this file's location, so the test no longer silently
    skips just because pytest was started from a different directory.
    The test is skipped only when the file is found in neither place.
    """
    relative_csv = Path('data/ontology/2023-09-28-elf-code-list-v1.5.csv')
    # Same root the module uses to locate the 'scripts' directory.
    repo_root = Path(__file__).resolve().parent.parent
    candidates = (relative_csv, repo_root / relative_csv)
    elf_csv_path = next((path for path in candidates if path.exists()), None)
    if elf_csv_path is None:
        pytest.skip("ELF codes CSV not available")
    elf_codes = load_elf_codes(elf_csv_path)
    assert len(elf_codes) > 2000  # should have 2,200+ codes
    for expected_code in ('V44D', 'BEWI', '5RDO'):
        assert expected_code in elf_codes
    assert elf_codes['V44D']['country_code'] == 'NL'
# ============================================================================
# PERFORMANCE TESTS
# ============================================================================
@pytest.mark.performance
def test_migration_performance(sample_elf_codes):
    """Migrate 1000 synthetic NL records and require it to finish in under 5s.

    Timing uses ``time.perf_counter()``, a monotonic high-resolution clock,
    instead of ``time.time()``, which can jump and has coarse resolution on
    some platforms. The throughput figure is guarded so that a measured
    duration of zero cannot raise ``ZeroDivisionError``.
    """
    import time

    record_count = 1000
    enum_cycle = ('STICHTING', 'ASSOCIATION', 'NGO', 'GOVERNMENT_AGENCY')
    records = [
        {
            'id': f'https://w3id.org/heritage/org/test-{i}',
            'legal_form': enum_cycle[i % len(enum_cycle)],
            'locations': [{'country': 'NL'}],
        }
        for i in range(record_count)
    ]

    start = time.perf_counter()
    results = [migrate_legal_form(r, 'NL', sample_elf_codes) for r in records]
    elapsed = time.perf_counter() - start

    assert len(results) == record_count
    assert elapsed < 5.0  # should complete in under 5 seconds
    # Avoid dividing by zero if the run was faster than the clock resolution.
    rate = record_count / elapsed if elapsed > 0 else float('inf')
    print(
        f"\n Migrated {record_count} records in {elapsed:.2f}s "
        f"({rate:.0f} records/sec)"
    )
if __name__ == '__main__':
    # pytest.main() returns the run's exit code; propagate it so that running
    # this file directly reports failures to the calling shell instead of
    # always exiting with status 0.
    raise SystemExit(pytest.main([__file__, '-v']))