glam/tests/parsers/test_eu_isil.py
2025-12-05 15:30:23 +01:00

317 lines
15 KiB
Python

"""
Tests for EU ISIL Registry Parser
"""
import pytest
from pathlib import Path
import tempfile
import os
from glam_extractor.parsers.eu_isil import EUIsilParser, EUIsilRecord
from glam_extractor.models import DataTier, DataSource, InstitutionType
# Sample EU ISIL text data (extracted from PDF).
# Holds four registry entries (EUR-COR0001, EUR-EP00001, EUR- CURIA0001 and
# EUR-EUI0001). Each entry's fields are spread over several physical lines,
# and the CURIA code carries a stray space after "EUR-"; the parser tests
# expect that space to be normalized away.
SAMPLE_EU_ISIL_TEXT = '''
EUR-COR0001 European Committee of the Belliard 99-101
06-Jan-17 / CoR Belgium Brussels
Regions
EUR-EP00001 Library EP Rue Wiertz, Belgium Brussels
21-May-18 European Parliament
EUR- CURIA0001 Court of Justice of the rue du Fort Niedergrünewald Grand-Duché de Luxembourg
03-Aug-22 européenne CJUE Luxembourg L-2925
European Union (Grand Duchy of Luxembourg)
Curia / CVRIA
EUR-EUI0001 20-Jun-16 European University Institute Archives HAEU 156, Via Bolognese Italy Florence 50014
Historical Archives of the
European Union
'''
@pytest.fixture
def sample_eu_text_file():
    """Yield the path of a temporary UTF-8 ``.txt`` file holding the sample data.

    The file is written before the requesting test runs and is deleted
    once that test has finished.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    )
    with handle:
        handle.write(SAMPLE_EU_ISIL_TEXT)
    sample_path = handle.name

    yield sample_path

    # delete=False leaves the file on disk after closing, so remove it here.
    os.unlink(sample_path)
class TestEUIsilRecord:
    """Checks for the EUIsilRecord Pydantic model."""

    def test_create_valid_record(self):
        """A fully populated record exposes the values it was built with."""
        fields = {
            "isil": "EUR-COR0001",
            "approval_date": "06-Jan-17",
            "organisation_name": "European Committee of the Regions",
            "subunit": None,
            "variants": "CoR",
            "address": "Belliard 99-101",
            "state": "Belgium",
            "city": "Brussels",
            "postcode": None,
        }
        record = EUIsilRecord(**fields)

        checked_attributes = (
            "isil",
            "approval_date",
            "organisation_name",
            "variants",
            "city",
            "state",
        )
        for attribute in checked_attributes:
            assert getattr(record, attribute) == fields[attribute]

    def test_isil_validation(self):
        """EUR-prefixed codes are accepted; the non-EUR codes below raise ValueError."""
        # Valid EUR ISIL codes
        accepted = (
            "EUR-COR0001",
            "EUR-EP00001",
            "EUR-CURIA0001",
            "EUR-GSC0001",
            "EUR-EC00001",
        )
        for isil_code in accepted:
            built = EUIsilRecord(isil=isil_code, organisation_name="Test Institution")
            assert built.isil == isil_code

        # Invalid codes should raise ValueError
        rejected = (
            "NL-AsdRM",  # Not EUR prefix
            "EURO-COR0001",  # Wrong prefix
        )
        for isil_code in rejected:
            with pytest.raises(ValueError):
                EUIsilRecord(isil=isil_code, organisation_name="Test")
class TestEUIsilParser:
    """Tests for EUIsilParser: text parsing and HeritageCustodian conversion."""

    @staticmethod
    def _parse_text(tmp_path, text):
        """Write *text* to a UTF-8 ``.txt`` file under *tmp_path* and parse it.

        Args:
            tmp_path: pytest's per-test temporary directory. pytest manages
                its lifetime, so no manual cleanup is required.
            text: Raw registry text to feed to the parser.

        Returns:
            A list of the records yielded by ``EUIsilParser.parse_file``.
        """
        text_file = tmp_path / "eu_isil_sample.txt"
        text_file.write_text(text, encoding='utf-8')
        # Hand over a plain string path, as a temp-file name would be.
        return list(EUIsilParser().parse_file(str(text_file)))

    @staticmethod
    def _find_record(records, isil):
        """Return the first record whose ``isil`` equals *isil*.

        Fails with an assertion naming the missing code, which is easier to
        diagnose than an ``IndexError`` from indexing an empty list.
        """
        match = next((r for r in records if r.isil == isil), None)
        assert match is not None, f"no record parsed for {isil}"
        return match

    def test_parse_file(self, sample_eu_text_file):
        """Test parsing EU ISIL text file"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))
        assert len(records) == 4

        # Check first record (EUR-COR0001)
        first = records[0]
        assert first.isil == "EUR-COR0001"
        assert first.organisation_name == "European Committee of the Regions"
        assert first.city == "Brussels"
        assert first.state == "Belgium"
        assert first.variants is None or "CoR" in first.variants

        # Check second record (EUR-EP00001)
        second = records[1]
        assert second.isil == "EUR-EP00001"
        assert second.organisation_name == "European Parliament"
        assert second.subunit == "Library"
        assert second.variants is None or "EP" in second.variants

        # Check CURIA record (with space in ISIL code)
        curia_record = self._find_record(records, "EUR-CURIA0001")
        assert curia_record.organisation_name == "Court of Justice of the European Union"
        assert curia_record.city == "Luxembourg"
        assert (
            curia_record.variants is None
            or "CJEU" in curia_record.variants
            or "CJUE" in curia_record.variants
        )

        # Check EUI record
        eui_record = self._find_record(records, "EUR-EUI0001")
        assert eui_record.organisation_name == "European University Institute"
        assert eui_record.subunit == "Archives"
        assert eui_record.city == "Florence"
        assert eui_record.state == "Italy"

    def test_parse_real_file(self):
        """Test parsing the real EU ISIL directory file (skipped when absent)"""
        # Locate the data file relative to this test module instead of a
        # developer-specific absolute path, so the test can run on any checkout.
        # NOTE(review): assumes this module lives at <repo>/tests/parsers/ and
        # the data at <repo>/data/isil/EUR/ -- confirm against the repo layout.
        repo_root = Path(__file__).resolve().parents[2]
        real_file = repo_root / "data" / "isil" / "EUR" / "isil-directory.txt"
        if not real_file.exists():
            pytest.skip("Real EU ISIL file not found")

        parser = EUIsilParser()
        records = list(parser.parse_file(real_file))

        # Should parse all 10 institutions
        assert len(records) == 10

        # Verify specific institutions
        isil_codes = {r.isil for r in records}
        expected_codes = {
            "EUR-COR0001",
            "EUR-EP00001",
            "EUR-GSC0001",
            "EUR-GSC0002",
            "EUR-EUI0001",
            "EUR-EC00001",
            "EUR-EC00002",
            "EUR-EESC0001",
            "EUR-CURIA0001",
            "EUR-EUI0002",
        }
        assert isil_codes == expected_codes

    def test_to_heritage_custodian(self, sample_eu_text_file):
        """Test conversion to HeritageCustodian model"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        # Convert first record
        custodian = parser.to_heritage_custodian(records[0])
        assert custodian.name == "European Committee of the Regions"
        assert str(custodian.institution_type) == 'OFFICIAL_INSTITUTION'

        # Check identifiers
        assert len(custodian.identifiers) == 1
        assert custodian.identifiers[0].identifier_scheme == "ISIL"
        assert custodian.identifiers[0].identifier_value == "EUR-COR0001"

        # Check locations
        assert len(custodian.locations) == 1
        assert custodian.locations[0].city == "Brussels"
        assert custodian.locations[0].country == "BE"

        # Check provenance
        assert str(custodian.provenance.data_source) == 'ISIL_REGISTRY'
        assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE'

    def test_institution_type_mapping(self, sample_eu_text_file):
        """Test that EU institutions are classified correctly"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        for record in records:
            custodian = parser.to_heritage_custodian(record)

            # Type should match subunit:
            # - Library subunits -> LIBRARY
            # - Archive subunits -> ARCHIVE
            # - Other EU institutions -> OFFICIAL_INSTITUTION
            subunit = (record.subunit or "").lower()
            if 'library' in subunit:
                expected_type = 'LIBRARY'
            elif 'archive' in subunit:
                expected_type = 'ARCHIVE'
            else:
                expected_type = 'OFFICIAL_INSTITUTION'
            assert str(custodian.institution_type) == expected_type

    def test_ghcid_generation(self, sample_eu_text_file):
        """Test GHCID generation for EU institutions"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        # Convert and check GHCIDs
        for record in records:
            custodian = parser.to_heritage_custodian(record)

            # GHCID should be generated
            assert custodian.ghcid_current is not None
            assert custodian.ghcid_uuid is not None
            assert custodian.ghcid_numeric is not None

            # GHCID should follow format: CC-RR-CCC-T-XXX
            # Examples:
            # - Brussels, Belgium: BE-00-BRU-O-XXX (Official Institution)
            # - Florence, Italy: IT-00-FLO-A-XXX (Archives)
            # - Luxembourg: LU-00-LUX-O-XXX (Official Institution)
            # - Library subunits: XX-00-XXX-L-XXX (Library)
            ghcid_parts = custodian.ghcid_current.split('-')
            assert len(ghcid_parts) == 5
            assert ghcid_parts[0] in ('BE', 'IT', 'LU')  # Country codes
            assert ghcid_parts[1] == '00'  # Region code for EU institutions
            assert ghcid_parts[3] in ('O', 'L', 'A')  # Institution type codes

    def test_space_in_isil_code(self, tmp_path):
        """Test handling of ISIL codes with spaces (EUR- CURIA0001)"""
        text_with_space = '''
Cour de justice de l'Union CJEU
EUR- CURIA0001 Court of Justice of the rue du Fort Niedergrünewald Grand-Duché de Luxembourg
03-Aug-22 européenne CJUE Luxembourg L-2925
European Union (Grand Duchy of Luxembourg)
Curia / CVRIA
'''
        records = self._parse_text(tmp_path, text_with_space)
        assert len(records) == 1

        # Space should be normalized out
        assert records[0].isil == "EUR-CURIA0001"
        assert records[0].organisation_name == "Court of Justice of the European Union"

    def test_multi_line_organization_name(self, tmp_path):
        """Test reconstruction of organization names split across multiple lines"""
        text_multi_line = '''
EUR-COR0001 European Committee of the Belliard 99-101
06-Jan-17 / CoR Belgium Brussels
Regions
'''
        records = self._parse_text(tmp_path, text_multi_line)
        assert len(records) == 1

        # Should reconstruct full name from lines 1 and 3
        assert records[0].organisation_name == "European Committee of the Regions"

    def test_empty_file(self, tmp_path):
        """Test parsing empty file"""
        records = self._parse_text(tmp_path, "")
        assert records == []

    def test_country_code_mapping(self, sample_eu_text_file):
        """Test country code mapping for EU institutions"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        country_mapping = {
            "Belgium": "BE",
            "Italy": "IT",
            "Grand-Duché de Luxembourg": "LU",
            "Grand Duchy of Luxembourg": "LU",
        }

        for record in records:
            custodian = parser.to_heritage_custodian(record)
            expected_country = country_mapping.get(record.state) if record.state else None
            # Only records that have a location and a state listed in the
            # mapping are checked; all others are skipped.
            if not custodian.locations or not expected_country:
                continue
            assert custodian.locations[0].country == expected_country