"""
Tests for EU ISIL Registry Parser
"""
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
import tempfile
|
|
import os
|
|
|
|
from glam_extractor.parsers.eu_isil import EUIsilParser, EUIsilRecord
|
|
from glam_extractor.models import DataTier, DataSource, InstitutionType
|
|
|
|
|
|
# Sample EU ISIL text data (extracted from PDF).
#
# Four registry entries, separated by one blank line. An entry may span several
# physical lines, and the third entry spells its code "EUR- CURIA0001" (with a
# space after the hyphen), which test_parse_file looks up as "EUR-CURIA0001".
#
# The stray lone "|" rows that had crept into this literal were separator
# artifacts, not registry data, and have been removed.
# NOTE(review): column gaps appear here as single spaces; if the PDF text
# export separates columns with wider runs of spaces, restore them from the
# source document — TODO confirm against the parser's expected input.
SAMPLE_EU_ISIL_TEXT = '''
EUR-COR0001 European Committee of the Belliard 99-101
06-Jan-17 / CoR Belgium Brussels
Regions

EUR-EP00001 Library EP Rue Wiertz, Belgium Brussels
21-May-18 European Parliament

EUR- CURIA0001 Court of Justice of the rue du Fort Niedergrünewald Grand-Duché de Luxembourg
03-Aug-22 européenne CJUE Luxembourg L-2925
European Union (Grand Duchy of Luxembourg)
Curia / CVRIA

EUR-EUI0001 20-Jun-16 European University Institute Archives HAEU 156, Via Bolognese Italy Florence 50014
Historical Archives of the
European Union
'''
|
|
|
|
|
|
@pytest.fixture
def sample_eu_text_file():
    """Create a temporary EU ISIL text file with sample data.

    Writes ``SAMPLE_EU_ISIL_TEXT`` to a UTF-8 ``.txt`` file created with
    ``delete=False`` so that it can be reopened by path, and removes the file
    again once the fixture is finished with.

    Yields:
        str: Filesystem path of the temporary file.
    """
    handle = tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False, encoding='utf-8'
    )
    try:
        # Close the handle before yielding so the content is flushed to disk
        # and the file can be reopened by name.
        with handle:
            handle.write(SAMPLE_EU_ISIL_TEXT)

        yield handle.name
    finally:
        # Cleanup: runs on normal teardown and also when the write above
        # fails, so the file is never left behind.
        os.unlink(handle.name)
|
|
|
|
|
|
class TestEUIsilRecord:
    """Test EUIsilRecord Pydantic model"""

    def test_create_valid_record(self):
        """Build a fully specified record and read its fields back."""
        fields = dict(
            isil="EUR-COR0001",
            approval_date="06-Jan-17",
            organisation_name="European Committee of the Regions",
            subunit=None,
            variants="CoR",
            address="Belliard 99-101",
            state="Belgium",
            city="Brussels",
            postcode=None,
        )
        record = EUIsilRecord(**fields)

        # Only these attributes are checked; each must equal its input value.
        checked = (
            "isil",
            "approval_date",
            "organisation_name",
            "variants",
            "city",
            "state",
        )
        for attribute in checked:
            assert getattr(record, attribute) == fields[attribute]

    def test_isil_validation(self):
        """Accept EUR-prefixed ISIL codes and reject everything else."""
        # Valid EUR ISIL codes
        accepted = (
            "EUR-COR0001",
            "EUR-EP00001",
            "EUR-CURIA0001",
            "EUR-GSC0001",
            "EUR-EC00001",
        )
        for code in accepted:
            built = EUIsilRecord(isil=code, organisation_name="Test Institution")
            assert built.isil == code

        # Invalid codes should raise ValueError
        rejected = (
            "NL-AsdRM",      # Not EUR prefix
            "EURO-COR0001",  # Wrong prefix
        )
        for code in rejected:
            with pytest.raises(ValueError):
                EUIsilRecord(isil=code, organisation_name="Test")
|
|
|
|
|
|
class TestEUIsilParser:
    """Test EU ISIL Parser"""

    # Location of the real registry export used by test_parse_real_file.
    # The default is a developer-machine path; set the environment variable
    # named below to point the test at a copy elsewhere.
    _REAL_FILE_ENV_VAR = "EU_ISIL_DIRECTORY_FILE"
    _DEFAULT_REAL_FILE = "/Users/kempersc/apps/glam/data/isil/EUR/isil-directory.txt"

    @staticmethod
    def _parse_text(text):
        """Write *text* to a temporary UTF-8 file, parse it, return the records.

        Args:
            text: Raw registry text to feed to the parser.

        Returns:
            list: Everything ``EUIsilParser().parse_file`` produced for the file.
        """
        handle = tempfile.NamedTemporaryFile(
            mode='w', suffix='.txt', delete=False, encoding='utf-8'
        )
        try:
            # Close before parsing so the content is flushed and the file can
            # be reopened by name.
            with handle:
                handle.write(text)
            return list(EUIsilParser().parse_file(handle.name))
        finally:
            # Also reached when the write or the parse fails.
            os.unlink(handle.name)

    @staticmethod
    def _record_by_isil(records, isil):
        """Return the first record whose ``isil`` equals *isil*.

        Fails with an explanatory assertion (rather than an IndexError) when
        no such record was parsed.
        """
        matches = [r for r in records if r.isil == isil]
        assert matches, f"no record parsed with ISIL {isil!r}"
        return matches[0]

    def test_parse_file(self, sample_eu_text_file):
        """Test parsing EU ISIL text file"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        assert len(records) == 4

        # Check first record (EUR-COR0001)
        assert records[0].isil == "EUR-COR0001"
        assert records[0].organisation_name == "European Committee of the Regions"
        assert records[0].city == "Brussels"
        assert records[0].state == "Belgium"
        assert records[0].variants is None or "CoR" in records[0].variants

        # Check second record (EUR-EP00001)
        assert records[1].isil == "EUR-EP00001"
        assert records[1].organisation_name == "European Parliament"
        assert records[1].subunit == "Library"
        assert records[1].variants is None or "EP" in records[1].variants

        # Check CURIA record (with space in ISIL code)
        curia_record = self._record_by_isil(records, "EUR-CURIA0001")
        assert curia_record.organisation_name == "Court of Justice of the European Union"
        assert curia_record.city == "Luxembourg"
        assert (
            curia_record.variants is None
            or "CJEU" in curia_record.variants
            or "CJUE" in curia_record.variants
        )

        # Check EUI record
        eui_record = self._record_by_isil(records, "EUR-EUI0001")
        assert eui_record.organisation_name == "European University Institute"
        assert eui_record.subunit == "Archives"
        assert eui_record.city == "Florence"
        assert eui_record.state == "Italy"

    def test_parse_real_file(self):
        """Test parsing the real EU ISIL directory file.

        The file location comes from the ``EU_ISIL_DIRECTORY_FILE`` environment
        variable when it is set and non-empty, otherwise from the built-in
        default path. The test is skipped when no file exists there.
        """
        configured = os.environ.get(self._REAL_FILE_ENV_VAR) or self._DEFAULT_REAL_FILE
        real_file = Path(configured)

        if not real_file.is_file():
            pytest.skip("Real EU ISIL file not found")

        parser = EUIsilParser()
        records = list(parser.parse_file(real_file))

        # Should parse all 10 institutions
        assert len(records) == 10

        # Verify specific institutions
        isil_codes = {r.isil for r in records}
        expected_codes = {
            "EUR-COR0001",
            "EUR-EP00001",
            "EUR-GSC0001",
            "EUR-GSC0002",
            "EUR-EUI0001",
            "EUR-EC00001",
            "EUR-EC00002",
            "EUR-EESC0001",
            "EUR-CURIA0001",
            "EUR-EUI0002",
        }
        assert isil_codes == expected_codes

    def test_to_heritage_custodian(self, sample_eu_text_file):
        """Test conversion to HeritageCustodian model"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        # Convert first record
        custodian = parser.to_heritage_custodian(records[0])

        assert custodian.name == "European Committee of the Regions"
        assert str(custodian.institution_type) == 'OFFICIAL_INSTITUTION'

        # Check identifiers
        assert len(custodian.identifiers) == 1
        assert custodian.identifiers[0].identifier_scheme == "ISIL"
        assert custodian.identifiers[0].identifier_value == "EUR-COR0001"

        # Check locations
        assert len(custodian.locations) == 1
        assert custodian.locations[0].city == "Brussels"
        assert custodian.locations[0].country == "BE"

        # Check provenance
        assert str(custodian.provenance.data_source) == 'ISIL_REGISTRY'
        assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE'

    def test_institution_type_mapping(self, sample_eu_text_file):
        """Test that EU institutions are classified correctly"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        for record in records:
            custodian = parser.to_heritage_custodian(record)

            # Type should match subunit:
            # - Library subunits → LIBRARY
            # - Archive subunits → ARCHIVE
            # - Other EU institutions → OFFICIAL_INSTITUTION
            subunit = (record.subunit or "").lower()
            if 'library' in subunit:
                expected_type = 'LIBRARY'
            elif 'archive' in subunit:
                expected_type = 'ARCHIVE'
            else:
                expected_type = 'OFFICIAL_INSTITUTION'

            assert str(custodian.institution_type) == expected_type

    def test_ghcid_generation(self, sample_eu_text_file):
        """Test GHCID generation for EU institutions"""
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        # Convert and check GHCIDs
        for record in records:
            custodian = parser.to_heritage_custodian(record)

            # GHCID should be generated
            assert custodian.ghcid_current is not None
            assert custodian.ghcid_uuid is not None
            assert custodian.ghcid_numeric is not None

            # GHCID should follow format: CC-RR-CCC-T-XXX
            # Examples:
            # - Brussels, Belgium: BE-00-BRU-O-XXX (Official Institution)
            # - Florence, Italy: IT-00-FLO-A-XXX (Archives)
            # - Luxembourg: LU-00-LUX-O-XXX (Official Institution)
            # - Library subunits: XX-00-XXX-L-XXX (Library)
            ghcid_parts = custodian.ghcid_current.split('-')
            assert len(ghcid_parts) == 5
            assert ghcid_parts[0] in ['BE', 'IT', 'LU']  # Country codes
            assert ghcid_parts[1] == '00'  # Region code for EU institutions
            assert ghcid_parts[3] in ['O', 'L', 'A']  # Institution type codes

    def test_space_in_isil_code(self):
        """Test handling of ISIL codes with spaces (EUR- CURIA0001)"""
        # Stray lone "|" separator rows have been removed from this literal.
        # NOTE(review): column gaps appear as single spaces; restore wider
        # gaps from the source document if the parser depends on them.
        text_with_space = '''
Cour de justice de l'Union CJEU
EUR- CURIA0001 Court of Justice of the rue du Fort Niedergrünewald Grand-Duché de Luxembourg
03-Aug-22 européenne CJUE Luxembourg L-2925
European Union (Grand Duchy of Luxembourg)
Curia / CVRIA
'''

        records = self._parse_text(text_with_space)

        assert len(records) == 1
        # Space should be normalized out
        assert records[0].isil == "EUR-CURIA0001"
        assert records[0].organisation_name == "Court of Justice of the European Union"

    def test_multi_line_organization_name(self):
        """Test reconstruction of organization names split across multiple lines"""
        # Stray lone "|" separator rows have been removed from this literal.
        # NOTE(review): column gaps appear as single spaces; restore wider
        # gaps from the source document if the parser depends on them.
        text_multi_line = '''
EUR-COR0001 European Committee of the Belliard 99-101
06-Jan-17 / CoR Belgium Brussels
Regions
'''

        records = self._parse_text(text_multi_line)

        assert len(records) == 1
        # Should reconstruct full name from lines 1 and 3
        assert records[0].organisation_name == "European Committee of the Regions"

    def test_empty_file(self):
        """Test parsing empty file"""
        records = self._parse_text("")
        assert len(records) == 0

    def test_country_code_mapping(self, sample_eu_text_file):
        """Test country code mapping for EU institutions.

        A record is only checked when its custodian has at least one location
        and its ``state`` is one of the names listed in the mapping below;
        other records are passed over without an assertion.
        """
        parser = EUIsilParser()
        records = list(parser.parse_file(sample_eu_text_file))

        country_mapping = {
            "Belgium": "BE",
            "Italy": "IT",
            "Grand-Duché de Luxembourg": "LU",
            "Grand Duchy of Luxembourg": "LU",
        }

        for record in records:
            custodian = parser.to_heritage_custodian(record)
            if not custodian.locations or not record.state:
                continue

            expected_country = country_mapping.get(record.state)
            if expected_country:
                assert custodian.locations[0].country == expected_country
|