""" Tests for EU ISIL Registry Parser """ import pytest from pathlib import Path import tempfile import os from glam_extractor.parsers.eu_isil import EUIsilParser, EUIsilRecord from glam_extractor.models import DataTier, DataSource, InstitutionType # Sample EU ISIL text data (extracted from PDF) SAMPLE_EU_ISIL_TEXT = ''' EUR-COR0001 European Committee of the Belliard 99-101 06-Jan-17 / CoR Belgium Brussels Regions EUR-EP00001 Library EP Rue Wiertz, Belgium Brussels 21-May-18 European Parliament EUR- CURIA0001 Court of Justice of the rue du Fort Niedergrünewald Grand-Duché de Luxembourg 03-Aug-22 européenne CJUE Luxembourg L-2925 European Union (Grand Duchy of Luxembourg) Curia / CVRIA EUR-EUI0001 20-Jun-16 European University Institute Archives HAEU 156, Via Bolognese Italy Florence 50014 Historical Archives of the European Union ''' @pytest.fixture def sample_eu_text_file(): """Create a temporary EU ISIL text file with sample data""" with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f: f.write(SAMPLE_EU_ISIL_TEXT) temp_path = f.name yield temp_path # Cleanup os.unlink(temp_path) class TestEUIsilRecord: """Test EUIsilRecord Pydantic model""" def test_create_valid_record(self): """Test creating a valid EU ISIL record""" record = EUIsilRecord( isil="EUR-COR0001", approval_date="06-Jan-17", organisation_name="European Committee of the Regions", subunit=None, variants="CoR", address="Belliard 99-101", state="Belgium", city="Brussels", postcode=None ) assert record.isil == "EUR-COR0001" assert record.approval_date == "06-Jan-17" assert record.organisation_name == "European Committee of the Regions" assert record.variants == "CoR" assert record.city == "Brussels" assert record.state == "Belgium" def test_isil_validation(self): """Test ISIL code validation""" # Valid EUR ISIL codes valid_codes = [ "EUR-COR0001", "EUR-EP00001", "EUR-CURIA0001", "EUR-GSC0001", "EUR-EC00001", ] for code in valid_codes: record = EUIsilRecord( isil=code, organisation_name="Test Institution" ) assert record.isil == code # Invalid codes should raise ValueError with pytest.raises(ValueError): EUIsilRecord( isil="NL-AsdRM", # Not EUR prefix organisation_name="Test" ) with pytest.raises(ValueError): EUIsilRecord( isil="EURO-COR0001", # Wrong prefix organisation_name="Test" ) class TestEUIsilParser: """Test EU ISIL Parser""" def test_parse_file(self, sample_eu_text_file): """Test parsing EU ISIL text file""" parser = EUIsilParser() records = list(parser.parse_file(sample_eu_text_file)) assert len(records) == 4 # Check first record (EUR-COR0001) assert records[0].isil == "EUR-COR0001" assert records[0].organisation_name == "European Committee of the Regions" assert records[0].city == "Brussels" assert records[0].state == "Belgium" assert records[0].variants is None or "CoR" in records[0].variants # Check second record (EUR-EP00001) assert records[1].isil == "EUR-EP00001" assert records[1].organisation_name == "European Parliament" assert records[1].subunit == "Library" assert records[1].variants is None or "EP" in records[1].variants # Check CURIA record (with space in ISIL code) curia_record = [r for r in records if r.isil == "EUR-CURIA0001"][0] assert curia_record.organisation_name == "Court of Justice of the European Union" assert curia_record.city == "Luxembourg" assert curia_record.variants is None or "CJEU" in curia_record.variants or "CJUE" in curia_record.variants # Check EUI record eui_record = [r for r in records if r.isil == "EUR-EUI0001"][0] assert eui_record.organisation_name == "European University Institute" assert eui_record.subunit == "Archives" assert eui_record.city == "Florence" assert eui_record.state == "Italy" def test_parse_real_file(self): """Test parsing the real EU ISIL directory file""" real_file = Path("/Users/kempersc/apps/glam/data/isil/EUR/isil-directory.txt") if not real_file.exists(): pytest.skip("Real EU ISIL file not found") parser = EUIsilParser() records = list(parser.parse_file(real_file)) # Should parse all 10 institutions assert len(records) == 10 # Verify specific institutions isil_codes = {r.isil for r in records} expected_codes = { "EUR-COR0001", "EUR-EP00001", "EUR-GSC0001", "EUR-GSC0002", "EUR-EUI0001", "EUR-EC00001", "EUR-EC00002", "EUR-EESC0001", "EUR-CURIA0001", "EUR-EUI0002" } assert isil_codes == expected_codes def test_to_heritage_custodian(self, sample_eu_text_file): """Test conversion to HeritageCustodian model""" parser = EUIsilParser() records = list(parser.parse_file(sample_eu_text_file)) # Convert first record custodian = parser.to_heritage_custodian(records[0]) assert custodian.name == "European Committee of the Regions" assert str(custodian.institution_type) == 'OFFICIAL_INSTITUTION' # Check identifiers assert len(custodian.identifiers) == 1 assert custodian.identifiers[0].identifier_scheme == "ISIL" assert custodian.identifiers[0].identifier_value == "EUR-COR0001" # Check locations assert len(custodian.locations) == 1 assert custodian.locations[0].city == "Brussels" assert custodian.locations[0].country == "BE" # Check provenance assert str(custodian.provenance.data_source) == 'ISIL_REGISTRY' assert str(custodian.provenance.data_tier) == 'TIER_1_AUTHORITATIVE' def test_institution_type_mapping(self, sample_eu_text_file): """Test that EU institutions are classified correctly""" parser = EUIsilParser() records = list(parser.parse_file(sample_eu_text_file)) for record in records: custodian = parser.to_heritage_custodian(record) # Type should match subunit: # - Library subunits → LIBRARY # - Archive subunits → ARCHIVE # - Other EU institutions → OFFICIAL_INSTITUTION if record.subunit and 'library' in record.subunit.lower(): assert str(custodian.institution_type) == 'LIBRARY' elif record.subunit and 'archive' in record.subunit.lower(): assert str(custodian.institution_type) == 'ARCHIVE' else: assert str(custodian.institution_type) == 'OFFICIAL_INSTITUTION' def test_ghcid_generation(self, sample_eu_text_file): """Test GHCID generation for EU institutions""" parser = EUIsilParser() records = list(parser.parse_file(sample_eu_text_file)) # Convert and check GHCIDs for record in records: custodian = parser.to_heritage_custodian(record) # GHCID should be generated assert custodian.ghcid_current is not None assert custodian.ghcid_uuid is not None assert custodian.ghcid_numeric is not None # GHCID should follow format: CC-RR-CCC-T-XXX # Examples: # - Brussels, Belgium: BE-00-BRU-O-XXX (Official Institution) # - Florence, Italy: IT-00-FLO-A-XXX (Archives) # - Luxembourg: LU-00-LUX-O-XXX (Official Institution) # - Library subunits: XX-00-XXX-L-XXX (Library) ghcid_parts = custodian.ghcid_current.split('-') assert len(ghcid_parts) == 5 assert ghcid_parts[0] in ['BE', 'IT', 'LU'] # Country codes assert ghcid_parts[1] == '00' # Region code for EU institutions assert ghcid_parts[3] in ['O', 'L', 'A'] # Institution type codes def test_space_in_isil_code(self): """Test handling of ISIL codes with spaces (EUR- CURIA0001)""" text_with_space = ''' Cour de justice de l'Union CJEU EUR- CURIA0001 Court of Justice of the rue du Fort Niedergrünewald Grand-Duché de Luxembourg 03-Aug-22 européenne CJUE Luxembourg L-2925 European Union (Grand Duchy of Luxembourg) Curia / CVRIA ''' with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f: f.write(text_with_space) temp_path = f.name try: parser = EUIsilParser() records = list(parser.parse_file(temp_path)) assert len(records) == 1 # Space should be normalized out assert records[0].isil == "EUR-CURIA0001" assert records[0].organisation_name == "Court of Justice of the European Union" finally: os.unlink(temp_path) def test_multi_line_organization_name(self): """Test reconstruction of organization names split across multiple lines""" text_multi_line = ''' EUR-COR0001 European Committee of the Belliard 99-101 06-Jan-17 / CoR Belgium Brussels Regions ''' with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f: f.write(text_multi_line) temp_path = f.name try: parser = EUIsilParser() records = list(parser.parse_file(temp_path)) assert len(records) == 1 # Should reconstruct full name from lines 1 and 3 assert records[0].organisation_name == "European Committee of the Regions" finally: os.unlink(temp_path) def test_empty_file(self): """Test parsing empty file""" with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f: f.write("") temp_path = f.name try: parser = EUIsilParser() records = list(parser.parse_file(temp_path)) assert len(records) == 0 finally: os.unlink(temp_path) def test_country_code_mapping(self, sample_eu_text_file): """Test country code mapping for EU institutions""" parser = EUIsilParser() records = list(parser.parse_file(sample_eu_text_file)) country_mapping = { "Belgium": "BE", "Italy": "IT", "Grand-Duché de Luxembourg": "LU", "Grand Duchy of Luxembourg": "LU" } for record in records: custodian = parser.to_heritage_custodian(record) if custodian.locations: location = custodian.locations[0] if record.state: expected_country = country_mapping.get(record.state) if expected_country: assert location.country == expected_country