"""
Unit tests for legal form migration script.

Tests the migration from generic legal form enums to ISO 20275 ELF codes.
"""

# Standard library
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path

# Third-party
import pytest
import yaml

# The migration module is imported as a plain top-level module, so the
# ``scripts`` directory (two levels up from this file, then ``scripts``) is
# pushed to the front of ``sys.path`` before importing it.
sys.path.insert(0, str(Path(__file__).parent.parent / 'scripts'))

from migrate_legal_form_to_iso20275 import (  # noqa: E402  (needs the path tweak above)
    migrate_legal_form,
    validate_elf_code,
    load_elf_codes,
    MigrationResult,
    COUNTRY_MAPPINGS,
)


# ============================================================================
# FIXTURES
# ============================================================================

@pytest.fixture
def sample_elf_codes():
    """Return a small in-memory ELF code registry keyed by 4-character code.

    The registry holds three active Dutch codes ('V44D', '33MN', 'A0W7'),
    an active '9999' entry whose descriptive fields are all empty strings,
    and one 'ZZZZ' entry whose status is 'INAC'. Every entry is a dict with
    the keys ``country``, ``country_code``, ``local_name``,
    ``transliterated_name`` and ``status``.
    """
    columns = (
        'country',
        'country_code',
        'local_name',
        'transliterated_name',
        'status',
    )
    # One row per code; values line up positionally with ``columns``.
    rows = {
        'V44D': ('Netherlands', 'NL', 'Stichting', 'Stichting', 'ACTV'),
        '33MN': (
            'Netherlands',
            'NL',
            'Vereniging met volledige rechtsbevoegdheid',
            'Vereniging',
            'ACTV',
        ),
        'A0W7': (
            'Netherlands',
            'NL',
            'Publiekrechtelijke rechtspersoon',
            'Public entity',
            'ACTV',
        ),
        '9999': ('', '', '', '', 'ACTV'),
        'ZZZZ': ('Test', 'XX', 'Inactive Test', 'Inactive Test', 'INAC'),
    }
    return {code: dict(zip(columns, values)) for code, values in rows.items()}


@pytest.fixture
def sample_record_dutch():
    """Return a Dutch institution record whose ``legal_form`` is a legacy enum.

    The record carries ``legal_form='STICHTING'`` (the pre-migration value),
    a single Amsterdam/NL location and a provenance sub-dict.
    """
    amsterdam = {'city': 'Amsterdam', 'country': 'NL'}
    provenance = {
        'data_source': 'CSV_REGISTRY',
        'notes': 'Extracted from Dutch ISIL registry',
    }
    return {
        'id': 'https://w3id.org/heritage/org/rijksmuseum',
        'legal_name': 'Stichting Rijksmuseum',
        'legal_form': 'STICHTING',  # legacy enum value, not an ELF code
        'registration_number': 'NL-KvK-41208408',
        'locations': [amsterdam],
        'provenance': provenance,
    }


@pytest.fixture
def sample_record_already_migrated():
    """Return a French record whose ``legal_form`` is already a 4-character code.

    ``legal_form`` is '5RDO' and the only location is Paris/FR.
    """
    paris = {'city': 'Paris', 'country': 'FR'}
    return {
        'id': 'https://w3id.org/heritage/org/bnf',
        'legal_name': 'Bibliothèque nationale de France',
        'legal_form': '5RDO',  # already in ISO 20275 form
        'locations': [paris],
    }


# ============================================================================
# TESTS: ELF CODE VALIDATION
# ============================================================================

def test_validate_elf_code_valid(sample_elf_codes):
    """An active code present in the registry validates with no error."""
    ok, message = validate_elf_code('V44D', sample_elf_codes)

    assert ok is True
    assert message is None


def test_validate_elf_code_invalid_format(sample_elf_codes):
    """A 3-character value is rejected with an 'Invalid format' message."""
    ok, message = validate_elf_code('ABC', sample_elf_codes)

    assert ok is False
    assert 'Invalid format' in message


def test_validate_elf_code_not_in_registry(sample_elf_codes):
    """A 4-character code absent from the registry is rejected as 'not found'."""
    ok, message = validate_elf_code('XXXX', sample_elf_codes)

    assert ok is False
    assert 'not found' in message


def test_validate_elf_code_inactive(sample_elf_codes):
    """A registry code whose status is 'INAC' is rejected as 'INACTIVE'."""
    ok, message = validate_elf_code('ZZZZ', sample_elf_codes)

    assert ok is False
    assert 'INACTIVE' in message


# ============================================================================
# TESTS: MIGRATION LOGIC
# ============================================================================

def test_migrate_dutch_stichting(sample_record_dutch, sample_elf_codes):
    """Migrating a Dutch 'STICHTING' enum yields ELF code 'V44D' at full confidence.

    Checks every field of the returned result: status, old and new value,
    country and confidence.
    """
    result = migrate_legal_form(sample_record_dutch, 'NL', sample_elf_codes)

    assert result.status == 'migrated'
    assert result.old_value == 'STICHTING'
    assert result.new_value == 'V44D'
    assert result.country == 'NL'
    # Confidence is a float; compare with a tolerance rather than exact equality.
    assert result.confidence == pytest.approx(1.0)


def test_migrate_already_iso20275(sample_record_already_migrated, sample_elf_codes):
    """A record that already holds an ELF code is reported as unchanged."""
    # NOTE(review): '5RDO' is not among the codes in the sample_elf_codes
    # fixture; confirm migrate_legal_form really reports 'Already valid' for a
    # code that is missing from the registry it is given.
    outcome = migrate_legal_form(sample_record_already_migrated, 'FR', sample_elf_codes)

    assert outcome.status == 'unchanged'
    assert outcome.old_value == '5RDO'
    assert outcome.new_value == '5RDO'
    assert 'Already valid' in outcome.notes


def test_migrate_no_legal_form(sample_elf_codes):
    """A record with no ``legal_form`` key is left unchanged and says why."""
    record_without_form = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_name': 'Test Organization',
    }

    outcome = migrate_legal_form(record_without_form, 'NL', sample_elf_codes)

    assert outcome.status == 'unchanged'
    assert outcome.old_value is None
    assert 'No legal_form' in outcome.notes


def test_migrate_unknown_enum(sample_elf_codes):
    """An unrecognised enum value is routed to manual review with no new code."""
    unmapped_record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': 'UNKNOWN_TYPE',
    }

    outcome = migrate_legal_form(unmapped_record, 'NL', sample_elf_codes)

    assert outcome.status == 'manual_review'
    assert outcome.old_value == 'UNKNOWN_TYPE'
    assert outcome.new_value is None
    assert 'Unknown enum' in outcome.notes


def test_migrate_low_confidence(sample_elf_codes):
    """A mapping whose confidence is below the threshold goes to manual review.

    'TRUST' in the NL context is expected to map to 'V44D' with confidence
    0.6, which is under the 0.7 threshold passed to the migration call.
    """
    threshold = 0.7
    record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': 'TRUST',  # Trust has lower confidence in NL context
        'locations': [{'country': 'NL'}],
    }

    result = migrate_legal_form(
        record, 'NL', sample_elf_codes, confidence_threshold=threshold
    )

    assert result.status == 'manual_review'
    assert result.old_value == 'TRUST'
    assert result.new_value == 'V44D'
    # Float comparison with tolerance instead of exact equality.
    assert result.confidence == pytest.approx(0.6)
    # Make the "below threshold" precondition an actual assertion.
    assert result.confidence < threshold


def test_migrate_country_inferred_from_location(sample_elf_codes):
    """With no explicit country code, the default mapping is used.

    The record has a Paris/FR location, but ``None`` is passed as the country
    code. The expected outcome is the default mapping ('5RDO') flagged for
    manual review; per the original author's note, inferring the country from
    locations happens at a higher level than ``migrate_legal_form``.
    """
    record_with_location = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': 'GOVERNMENT_AGENCY',
        'locations': [
            {'city': 'Paris', 'country': 'FR'},
        ],
    }

    # Deliberately pass no country code.
    outcome = migrate_legal_form(record_with_location, None, sample_elf_codes)

    assert outcome.status == 'manual_review'
    assert outcome.new_value == '5RDO'  # Default mapping


# ============================================================================
# TESTS: COUNTRY-SPECIFIC MAPPINGS
# ============================================================================

def test_french_association_mapping(sample_elf_codes):
    """The FR mapping table has an 'ASSOCIATION' entry and migration keeps the old value.

    Only ``old_value`` of the migration result is asserted: per the original
    author's note, the target code (BEWI) is not in ``sample_elf_codes``.
    """
    french_record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': 'ASSOCIATION',
        'locations': [{'country': 'FR'}],
    }

    outcome = migrate_legal_form(french_record, 'FR', sample_elf_codes)

    # Note: BEWI not in sample_elf_codes, so will use validation check
    assert outcome.old_value == 'ASSOCIATION'

    # The mapping table itself must know about French associations.
    assert 'FR' in COUNTRY_MAPPINGS
    assert 'ASSOCIATION' in COUNTRY_MAPPINGS['FR']


def test_german_stiftung_mapping(sample_elf_codes):
    """The DE table maps the 'STICHTING' enum to 'V2YH' at full confidence.

    ``sample_elf_codes`` is requested but not used; the test inspects
    ``COUNTRY_MAPPINGS`` directly.
    """
    assert 'DE' in COUNTRY_MAPPINGS
    assert 'STICHTING' in COUNTRY_MAPPINGS['DE']

    mapping = COUNTRY_MAPPINGS['DE']['STICHTING']
    assert mapping.new_elf_code == 'V2YH'
    # Float comparison with tolerance instead of exact equality.
    assert mapping.confidence == pytest.approx(1.0)


def test_uk_charity_mapping(sample_elf_codes):
    """The GB table maps the 'NGO' enum to '9HLU' with confidence 0.95.

    ``sample_elf_codes`` is requested but not used; the test inspects
    ``COUNTRY_MAPPINGS`` directly.
    """
    assert 'GB' in COUNTRY_MAPPINGS
    assert 'NGO' in COUNTRY_MAPPINGS['GB']

    mapping = COUNTRY_MAPPINGS['GB']['NGO']
    assert mapping.new_elf_code == '9HLU'
    # Float comparison with tolerance instead of exact equality.
    assert mapping.confidence == pytest.approx(0.95)


def test_us_nonprofit_mapping(sample_elf_codes):
    """The US table maps the 'NGO' enum to 'QQQ0' with confidence 0.95.

    ``sample_elf_codes`` is requested but not used; the test inspects
    ``COUNTRY_MAPPINGS`` directly.
    """
    assert 'US' in COUNTRY_MAPPINGS
    assert 'NGO' in COUNTRY_MAPPINGS['US']

    mapping = COUNTRY_MAPPINGS['US']['NGO']
    assert mapping.new_elf_code == 'QQQ0'  # 501(c)(3)
    # Float comparison with tolerance instead of exact equality.
    assert mapping.confidence == pytest.approx(0.95)


# ============================================================================
# TESTS: EDGE CASES
# ============================================================================

def test_migrate_invalid_existing_code(sample_elf_codes):
    """An existing code that fails validation is reported as an error.

    'ZZZZ' is present in the sample registry but has status 'INAC'.
    """
    inactive_record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': 'ZZZZ',  # Inactive code
    }

    outcome = migrate_legal_form(inactive_record, 'NL', sample_elf_codes)

    assert outcome.status == 'error'
    assert 'Invalid ISO 20275' in outcome.notes


def test_migrate_case_sensitive(sample_elf_codes):
    """A lowercase spelling of a valid code is not accepted as that code."""
    lowercase_record = {
        'id': 'https://w3id.org/heritage/org/test',
        'legal_form': 'v44d',  # Lowercase
    }

    outcome = migrate_legal_form(lowercase_record, 'NL', sample_elf_codes)

    # Expected to fall through to the unknown-enum path rather than being
    # recognised as the valid code 'V44D'.
    assert outcome.status == 'manual_review'


# ============================================================================
# TESTS: INTEGRATION
# ============================================================================

def test_load_real_elf_codes():
    """Load the real ELF code CSV and spot-check its contents.

    The CSV is looked up first relative to the current working directory
    (the original behaviour) and then relative to the directory two levels
    above this file, which is the same root the ``scripts`` import path is
    derived from. The test is skipped when neither location has the file.
    """
    relative_csv = Path('data/ontology/2023-09-28-elf-code-list-v1.5.csv')
    # Fallback so the test does not silently skip when pytest is started from
    # a directory other than the project root.
    project_root = Path(__file__).resolve().parent.parent
    candidates = (relative_csv, project_root / relative_csv)

    elf_csv_path = next((path for path in candidates if path.exists()), None)
    if elf_csv_path is None:
        pytest.skip("ELF codes CSV not available")

    elf_codes = load_elf_codes(elf_csv_path)

    assert len(elf_codes) > 2000  # Should have 2,200+ codes
    for expected_code in ('V44D', 'BEWI', '5RDO'):
        assert expected_code in elf_codes
    assert elf_codes['V44D']['country_code'] == 'NL'


# ============================================================================
# PERFORMANCE TESTS
# ============================================================================

@pytest.mark.performance
def test_migration_performance(sample_elf_codes):
    """Migrate 1000 synthetic NL records and require it to take under 5 seconds.

    The records cycle through four legacy enum values. Timing uses
    ``time.perf_counter`` (monotonic, high resolution) so the measurement is
    not affected by system clock adjustments, and the throughput figure is
    guarded against a zero elapsed time.
    """
    import time

    record_count = 1000
    legacy_forms = ['STICHTING', 'ASSOCIATION', 'NGO', 'GOVERNMENT_AGENCY']

    # Build the synthetic records up front so only migration is timed.
    records = [
        {
            'id': f'https://w3id.org/heritage/org/test-{i}',
            'legal_form': legacy_forms[i % len(legacy_forms)],
            'locations': [{'country': 'NL'}],
        }
        for i in range(record_count)
    ]

    start = time.perf_counter()
    results = [migrate_legal_form(r, 'NL', sample_elf_codes) for r in records]
    elapsed = time.perf_counter() - start

    assert len(results) == record_count
    assert elapsed < 5.0  # Should complete in under 5 seconds

    # Avoid ZeroDivisionError if the clock reports no measurable time.
    rate = record_count / elapsed if elapsed > 0 else float('inf')
    print(f"\n Migrated {record_count} records in {elapsed:.2f}s ({rate:.0f} records/sec)")


if __name__ == '__main__':
    # Propagate pytest's exit code so a direct run of this file reports
    # failures to the calling shell instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, '-v']))