"""Tests for Global Heritage Custodian Identifier (GHCID) system."""

import re
from datetime import datetime, timezone

import pytest

from glam_extractor.identifiers.ghcid import (
    GHCIDComponents,
    GHCIDGenerator,
    GHCIDHistoryEntry,
    InstitutionType,
    extract_abbreviation_from_name,
)


class TestAbbreviationExtraction:
    """Test abbreviation extraction from English institution names."""

    def test_basic_museum_name(self) -> None:
        """Test basic museum name abbreviation."""
        abbr = extract_abbreviation_from_name("State Museum Amsterdam")
        assert abbr == "SMA"

    def test_library_with_prepositions(self) -> None:
        """Test library name with stopwords removed."""
        abbr = extract_abbreviation_from_name("National Library of the Netherlands")
        assert abbr == "NLN"

    def test_modern_art_museum(self) -> None:
        """Test museum with multiple significant words."""
        abbr = extract_abbreviation_from_name("Museum of Modern Art")
        assert abbr == "MMA"

    def test_single_word_name(self) -> None:
        """Test single-word institution name."""
        abbr = extract_abbreviation_from_name("Rijksmuseum")
        assert abbr == "R"

    def test_hyphenated_name(self) -> None:
        """Test a three-word personal-name institution."""
        # NOTE: despite the test name, this input contains no hyphen. Actual
        # hyphen handling is exercised by
        # TestEdgeCases.test_special_characters_in_name.
        abbr = extract_abbreviation_from_name("Anne Frank House")
        assert abbr == "AFH"

    def test_lowercase_input(self) -> None:
        """Test that lowercase input is handled correctly."""
        abbr = extract_abbreviation_from_name("national gallery london")
        assert abbr == "NGL"

    def test_multilingual_stopwords(self) -> None:
        """Test filtering of non-English stopwords."""
        abbr = extract_abbreviation_from_name("Biblioteca Nacional do Brasil")
        assert abbr == "BNB"  # 'do' is filtered as Portuguese stopword

    def test_empty_string(self) -> None:
        """Test empty string handling."""
        abbr = extract_abbreviation_from_name("")
        assert abbr == ""

    def test_only_stopwords(self) -> None:
        """Test that a name consisting only of stopwords does not crash."""
        abbr = extract_abbreviation_from_name("The of and")
        # Edge case: the exact output is deliberately left unspecified (empty
        # or minimal), so only require that the call succeeds and yields a
        # string. The previous `len(abbr) >= 0` check could never fail.
        assert isinstance(abbr, str)


class TestGHCIDComponents:
    """Test GHCID component validation and formatting."""

    def test_valid_components(self) -> None:
        """Test creation of valid GHCID components."""
        components = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="RM",
        )
        assert components.to_string() == "NL-NH-AMS-M-RM"

    def test_to_numeric_hash(self) -> None:
        """Test numeric hash generation."""
        components = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="RM",
        )
        numeric = components.to_numeric()
        assert isinstance(numeric, int)
        assert numeric > 0
        assert numeric < 2**64  # 64-bit unsigned integer

    def test_hash_consistency(self) -> None:
        """Test that same components produce same hash."""
        comp1 = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        comp2 = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        assert comp1.to_numeric() == comp2.to_numeric()

    def test_hash_uniqueness(self) -> None:
        """Test that different components produce different hashes."""
        comp1 = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        comp2 = GHCIDComponents("NL", "ZH", "RTM", "M", "BVM")
        assert comp1.to_numeric() != comp2.to_numeric()

    def test_validate_valid_components(self) -> None:
        """Test validation of valid components."""
        components = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        is_valid, error = components.validate()
        assert is_valid is True
        assert error is None

    def test_validate_invalid_country_code(self) -> None:
        """Test validation catches invalid country codes."""
        components = GHCIDComponents("NLD", "NH", "AMS", "M", "RM")  # 3 chars instead of 2
        is_valid, error = components.validate()
        assert is_valid is False
        assert "country code" in error.lower()

    def test_validate_invalid_region_code(self) -> None:
        """Test validation catches invalid region codes."""
        components = GHCIDComponents("NL", "NHZH", "AMS", "M", "RM")  # Too long
        is_valid, error = components.validate()
        assert is_valid is False
        assert "region code" in error.lower()

    def test_validate_invalid_city_locode(self) -> None:
        """Test validation catches invalid city LOCODEs."""
        components = GHCIDComponents("NL", "NH", "AMST", "M", "RM")  # 4 chars instead of 3
        is_valid, error = components.validate()
        assert is_valid is False
        assert "locode" in error.lower()

    def test_validate_invalid_institution_type(self) -> None:
        """Test validation catches invalid institution types."""
        components = GHCIDComponents("NL", "NH", "AMS", "MUS", "RM")  # Too long
        is_valid, error = components.validate()
        assert is_valid is False
        assert "institution type" in error.lower()

    def test_validate_invalid_abbreviation(self) -> None:
        """Test validation catches invalid abbreviations."""
        components = GHCIDComponents("NL", "NH", "AMS", "M", "RIJKSMUSEUM")  # Too long
        is_valid, error = components.validate()
        assert is_valid is False
        assert "abbreviation" in error.lower()

    def test_uppercase_normalization(self) -> None:
        """Test that components are automatically uppercased."""
        components = GHCIDComponents(
            country_code="nl",
            region_code="nh",
            city_locode="ams",
            institution_type="m",
            abbreviation="rm",
        )
        # Components should be uppercase when created
        assert components.to_string() == "NL-NH-AMS-M-RM"


class TestGHCIDGenerator:
    """Test GHCID generation."""

    def test_generate_rijksmuseum(self) -> None:
        """Test GHCID generation for Rijksmuseum Amsterdam."""
        generator = GHCIDGenerator()
        components = generator.generate(
            institution_name="Rijksmuseum",
            english_name="State Museum Amsterdam",
            institution_type=InstitutionType.MUSEUM,
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
        )
        assert components.country_code == "NL"
        assert components.region_code == "NH"
        assert components.city_locode == "AMS"
        assert components.institution_type == "M"
        assert components.abbreviation == "SMA"
        assert components.to_string() == "NL-NH-AMS-M-SMA"

    def test_generate_national_library(self) -> None:
        """Test GHCID generation for National Library of the Netherlands."""
        generator = GHCIDGenerator()
        components = generator.generate(
            institution_name="Koninklijke Bibliotheek",
            english_name="National Library of the Netherlands",
            institution_type=InstitutionType.LIBRARY,
            country_code="NL",
            region_code="ZH",
            city_locode="HAG",
        )
        assert components.institution_type == "L"
        assert components.abbreviation == "NLN"
        assert components.to_string() == "NL-ZH-HAG-L-NLN"

    def test_generate_brazilian_library(self) -> None:
        """Test GHCID generation for Brazilian institution."""
        generator = GHCIDGenerator()
        components = generator.generate(
            institution_name="Biblioteca Nacional do Brasil",
            english_name="National Library of Brazil",
            institution_type=InstitutionType.LIBRARY,
            country_code="BR",
            region_code="RJ",
            city_locode="RIO",
        )
        assert components.country_code == "BR"
        assert components.abbreviation == "NLB"
        assert components.to_string() == "BR-RJ-RIO-L-NLB"

    def test_generate_with_invalid_components(self) -> None:
        """Test that invalid components raise ValueError."""
        generator = GHCIDGenerator()
        with pytest.raises(ValueError, match="Invalid GHCID"):
            generator.generate(
                institution_name="Test Museum",
                english_name="Test Museum",
                institution_type=InstitutionType.MUSEUM,
                country_code="INVALID",  # Invalid country code
                region_code="NH",
                city_locode="AMS",
            )

    def test_from_isil(self) -> None:
        """Test GHCID generation from ISIL code."""
        generator = GHCIDGenerator()
        components = generator.from_isil(
            isil_code="NL-AmsRM",
            english_name="State Museum Amsterdam",
            institution_type=InstitutionType.MUSEUM,
            region_code="NH",
            city_locode="AMS",
        )
        assert components.country_code == "NL"
        assert components.to_string() == "NL-NH-AMS-M-SMA"

    def test_from_isil_invalid_format(self) -> None:
        """Test that invalid ISIL format raises ValueError."""
        generator = GHCIDGenerator()
        with pytest.raises(ValueError, match="Invalid ISIL"):
            generator.from_isil(
                isil_code="INVALID",
                english_name="Test",
                institution_type=InstitutionType.MUSEUM,
                region_code="NH",
                city_locode="AMS",
            )

    def test_institution_type_enum(self) -> None:
        """Test that InstitutionType enum works correctly."""
        generator = GHCIDGenerator()
        components = generator.generate(
            institution_name="Archive",
            english_name="Test Archive",
            institution_type=InstitutionType.ARCHIVE,
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
        )
        assert components.institution_type == "A"

    def test_institution_type_string(self) -> None:
        """Test that string institution type works."""
        generator = GHCIDGenerator()
        components = generator.generate(
            institution_name="Gallery",
            english_name="Test Gallery",
            institution_type="G",  # String instead of enum
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
        )
        assert components.institution_type == "G"


class TestGHCIDHistory:
    """Test GHCID history tracking."""

    def test_create_history_entry(self) -> None:
        """Test creation of history entry."""
        generator = GHCIDGenerator()
        components = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        entry = generator.create_history_entry(
            components=components,
            institution_name="Rijksmuseum",
            location_city="Amsterdam",
            location_country="Netherlands",
            reason="Initial identifier",
        )
        assert entry.ghcid == "NL-NH-AMS-M-RM"
        assert entry.ghcid_numeric == components.to_numeric()
        assert entry.institution_name == "Rijksmuseum"
        assert entry.location_city == "Amsterdam"
        assert entry.valid_to is None  # Still current
        assert entry.reason == "Initial identifier"

    def test_history_entry_timestamps(self) -> None:
        """Test that timestamps are properly set."""
        generator = GHCIDGenerator()
        components = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        valid_from = datetime(2000, 1, 1, tzinfo=timezone.utc)
        valid_to = datetime(2020, 1, 1, tzinfo=timezone.utc)
        entry = generator.create_history_entry(
            components=components,
            institution_name="Old Name",
            location_city="Amsterdam",
            location_country="Netherlands",
            valid_from=valid_from,
            valid_to=valid_to,
            reason="Name change",
        )
        assert entry.valid_from == valid_from
        assert entry.valid_to == valid_to

    def test_history_entry_default_timestamp(self) -> None:
        """Test that valid_from defaults to current time."""
        generator = GHCIDGenerator()
        components = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        # Bracket the call with two timestamps so the default can be checked
        # without depending on the exact instant it was taken.
        before = datetime.now(timezone.utc)
        entry = generator.create_history_entry(
            components=components,
            institution_name="Test",
            location_city="Amsterdam",
            location_country="Netherlands",
        )
        after = datetime.now(timezone.utc)
        assert before <= entry.valid_from <= after


class TestEdgeCases:
    """Test edge cases and error handling."""

    def test_empty_english_name(self) -> None:
        """Test handling of empty English name."""
        abbr = extract_abbreviation_from_name("")
        assert abbr == ""

    def test_special_characters_in_name(self) -> None:
        """Test handling of special characters."""
        # Hyphens are converted to spaces, so "Anne-Frank" → "Anne Frank"
        abbr = extract_abbreviation_from_name("Anne-Frank Museum (Amsterdam)")
        assert abbr == "AFMA"  # A(nne) F(rank) M(useum) A(msterdam)

    def test_numeric_in_abbreviation(self) -> None:
        """Test that a name containing a number still yields an abbreviation."""
        # This is a tricky case - should numbers be included?
        # Current implementation: only extracts first letter of words, so the
        # treatment of "1944" is intentionally not pinned down here.
        abbr = extract_abbreviation_from_name("Museum 1944")
        assert len(abbr) > 0  # Should extract 'M' from Museum

    def test_very_long_name(self) -> None:
        """Test handling of very long institution names."""
        long_name = (
            "The Royal National Museum of Art History and Cultural Heritage "
            "of the Netherlands"
        )
        abbr = extract_abbreviation_from_name(long_name)
        # Should extract first letter of significant words
        assert len(abbr) > 0
        assert len(abbr) <= 10  # Should be reasonable length


class TestCollisionResolution:
    """Test GHCID collision resolution with Wikidata Q-numbers."""

    def test_ghcid_without_qid(self) -> None:
        """Test standard GHCID without collision."""
        components = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="RM",
        )
        # Should not have Q-number
        assert components.wikidata_qid is None
        # GHCID should be standard format
        assert components.to_string() == "NL-NH-AMS-M-RM"

    def test_ghcid_with_qid(self) -> None:
        """Test GHCID with Wikidata Q-number for collision resolution."""
        components = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q924335",
        )
        # Q-number should be stored without prefix
        assert components.wikidata_qid == "924335"
        # GHCID should include Q-number suffix
        assert components.to_string() == "NL-NH-AMS-M-SM-Q924335"

    def test_qid_normalization_with_prefix(self) -> None:
        """Test Q-prefix is stripped during initialization."""
        # Test with Q prefix
        c1 = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q924335",
        )
        # Stored without Q
        assert c1.wikidata_qid == "924335"
        # But displayed with Q
        assert c1.to_string() == "NL-NH-AMS-M-SM-Q924335"

    def test_qid_normalization_without_prefix(self) -> None:
        """Test Q-prefix is not required in input."""
        # Test without Q prefix
        c2 = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="924335",
        )
        # Should produce same result
        assert c2.wikidata_qid == "924335"
        assert c2.to_string() == "NL-NH-AMS-M-SM-Q924335"

    def test_qid_consistency(self) -> None:
        """Test both input formats produce identical results."""
        # With Q prefix
        c1 = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")
        # Without Q prefix
        c2 = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="924335")
        # Should be identical
        assert c1.to_string() == c2.to_string()
        assert c1.to_numeric() == c2.to_numeric()
        assert c1.wikidata_qid == c2.wikidata_qid

    def test_collision_example_stedelijk_museum(self) -> None:
        """Test real collision example: Stedelijk Museum Amsterdam."""
        stedelijk = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q924335",  # Real Wikidata ID for Stedelijk Museum
        )
        assert stedelijk.to_string() == "NL-NH-AMS-M-SM-Q924335"
        # Numeric hash should be different from base GHCID
        base = GHCIDComponents("NL", "NH", "AMS", "M", "SM")
        assert stedelijk.to_numeric() != base.to_numeric()

    def test_collision_example_science_museum(self) -> None:
        """Test hypothetical collision: Science Museum Amsterdam."""
        science = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q123456",  # Hypothetical Q-number
        )
        assert science.to_string() == "NL-NH-AMS-M-SM-Q123456"

    def test_collision_different_numeric_hashes(self) -> None:
        """Test that colliding GHCIDs with different Q-numbers have different numeric hashes."""
        # Two museums in Amsterdam both abbreviated "SM"
        stedelijk = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")
        science = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q123456")
        base = GHCIDComponents("NL", "NH", "AMS", "M", "SM")
        # All three should have different numeric hashes
        assert stedelijk.to_numeric() != science.to_numeric()
        assert stedelijk.to_numeric() != base.to_numeric()
        assert science.to_numeric() != base.to_numeric()
        # But same base format (before Q-number)
        assert stedelijk.to_string().startswith("NL-NH-AMS-M-SM")
        assert science.to_string().startswith("NL-NH-AMS-M-SM")
        assert base.to_string() == "NL-NH-AMS-M-SM"

    def test_qid_validation_numeric_only(self) -> None:
        """Test Q-number validation accepts only numeric values."""
        # Valid: numeric after Q stripped
        c1 = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")
        is_valid, error = c1.validate()
        assert is_valid
        assert error is None
        # Valid: numeric without Q
        c2 = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="924335")
        is_valid, error = c2.validate()
        assert is_valid
        assert error is None

    def test_qid_invalid_characters(self) -> None:
        """Test Q-number validation rejects non-numeric values."""
        # Invalid: letters after Q stripped
        c = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="QABC")
        is_valid, error = c.validate()
        assert not is_valid
        assert "Wikidata QID" in error
        assert "numeric" in error.lower()

    def test_schema_regex_pattern_without_qid(self) -> None:
        """Test GHCID matches schema regex pattern (without Q-number)."""
        # Pattern from schema (without Q-suffix)
        pattern = r'^[A-Z]{2}-[A-Z0-9]{1,3}-[A-Z]{3}-[A-Z]-[A-Z0-9]{1,10}$'
        components = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        ghcid = components.to_string()
        # fullmatch rather than match: `$` alone would also accept a trailing
        # newline, which the schema pattern is not meant to allow.
        assert re.fullmatch(pattern, ghcid)

    def test_schema_regex_pattern_with_qid(self) -> None:
        """Test GHCID matches schema regex pattern (with Q-number)."""
        # Pattern from schema (with optional Q-suffix)
        pattern = r'^[A-Z]{2}-[A-Z0-9]{1,3}-[A-Z]{3}-[A-Z]-[A-Z0-9]{1,10}(-Q[0-9]+)?$'
        components = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")
        ghcid = components.to_string()
        # fullmatch rather than match: `$` alone would also accept a trailing
        # newline, which the schema pattern is not meant to allow.
        assert re.fullmatch(pattern, ghcid)

    def test_history_entry_with_collision(self) -> None:
        """Test creating history entry for GHCID with collision resolver."""
        generator = GHCIDGenerator()
        components = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q924335",
        )
        entry = generator.create_history_entry(
            components=components,
            institution_name="Stedelijk Museum",
            location_city="Amsterdam",
            location_country="Netherlands",
            reason="Collision detected - added Wikidata Q-number",
        )
        # Entry should include Q-number in GHCID string
        assert entry.ghcid == "NL-NH-AMS-M-SM-Q924335"
        # Case-insensitive check; a separate "Collision" test would be redundant.
        assert "collision" in entry.reason.lower()
        # Numeric hash should reflect Q-number
        assert entry.ghcid_numeric == components.to_numeric()