glam/tests/identifiers/test_ghcid.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

598 lines
22 KiB
Python

"""Tests for Global Heritage Custodian Identifier (GHCID) system."""
import pytest
from datetime import datetime, timezone
from glam_extractor.identifiers.ghcid import (
GHCIDComponents,
GHCIDGenerator,
GHCIDHistoryEntry,
InstitutionType,
extract_abbreviation_from_name,
)
class TestAbbreviationExtraction:
    """Tests for ``extract_abbreviation_from_name`` on institution names.

    Every test passes a single name to the extractor and checks the returned
    abbreviation.
    """

    def test_basic_museum_name(self):
        """A plain three-word name yields one initial per word."""
        abbr = extract_abbreviation_from_name("State Museum Amsterdam")
        assert abbr == "SMA"

    def test_library_with_prepositions(self):
        """Stopwords ("of", "the") contribute no letter to the abbreviation."""
        abbr = extract_abbreviation_from_name("National Library of the Netherlands")
        assert abbr == "NLN"

    def test_modern_art_museum(self):
        """All significant words around a stopword are represented."""
        abbr = extract_abbreviation_from_name("Museum of Modern Art")
        assert abbr == "MMA"

    def test_single_word_name(self):
        """A single-word name abbreviates to its first letter."""
        abbr = extract_abbreviation_from_name("Rijksmuseum")
        assert abbr == "R"

    def test_hyphenated_name(self):
        """Hyphen-joined words each contribute their own initial."""
        # The input deliberately contains a hyphen; a name separated only by
        # spaces would not exercise the hyphen handling this test is about.
        abbr = extract_abbreviation_from_name("Anne-Frank House")
        assert abbr == "AFH"

    def test_lowercase_input(self):
        """Lowercase input still produces an uppercase abbreviation."""
        abbr = extract_abbreviation_from_name("national gallery london")
        assert abbr == "NGL"

    def test_multilingual_stopwords(self):
        """Non-English stopwords are filtered as well."""
        abbr = extract_abbreviation_from_name("Biblioteca Nacional do Brasil")
        assert abbr == "BNB"  # 'do' is filtered as Portuguese stopword

    def test_empty_string(self):
        """An empty name yields an empty abbreviation."""
        abbr = extract_abbreviation_from_name("")
        assert abbr == ""

    def test_only_stopwords(self):
        """A name made up solely of stopwords must not break the extractor."""
        abbr = extract_abbreviation_from_name("The of and")
        # Edge case: the exact output is intentionally not pinned down (an
        # empty or a minimal abbreviation are both acceptable), but the call
        # must return a string rather than raise or hand back something else.
        assert isinstance(abbr, str)
class TestGHCIDComponents:
    """Exercises GHCIDComponents: string form, numeric hash and validation."""

    @staticmethod
    def _expect_invalid(components, expected_fragment):
        """Assert that validate() rejects *components*.

        The returned error message must contain *expected_fragment* when
        compared case-insensitively.
        """
        ok, message = components.validate()
        assert ok is False
        assert expected_fragment in message.lower()

    def test_valid_components(self):
        """Well-formed parts are joined with hyphens by to_string()."""
        rijksmuseum = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        assert rijksmuseum.to_string() == "NL-NH-AMS-M-RM"

    def test_to_numeric_hash(self):
        """to_numeric() returns a positive int that fits in 64 unsigned bits."""
        value = GHCIDComponents("NL", "NH", "AMS", "M", "RM").to_numeric()
        assert isinstance(value, int)
        assert 0 < value < 2**64  # 64-bit unsigned integer

    def test_hash_consistency(self):
        """Two instances built from equal parts hash to the same number."""
        first = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        second = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        assert first.to_numeric() == second.to_numeric()

    def test_hash_uniqueness(self):
        """Instances built from different parts hash to different numbers."""
        amsterdam = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        rotterdam = GHCIDComponents("NL", "ZH", "RTM", "M", "BVM")
        assert amsterdam.to_numeric() != rotterdam.to_numeric()

    def test_validate_valid_components(self):
        """validate() reports (True, None) for well-formed parts."""
        ok, message = GHCIDComponents("NL", "NH", "AMS", "M", "RM").validate()
        assert ok is True
        assert message is None

    def test_validate_invalid_country_code(self):
        """A three-letter country code is rejected."""
        # 3 chars instead of 2
        self._expect_invalid(
            GHCIDComponents("NLD", "NH", "AMS", "M", "RM"), "country code"
        )

    def test_validate_invalid_region_code(self):
        """An over-long region code is rejected."""
        # Too long
        self._expect_invalid(
            GHCIDComponents("NL", "NHZH", "AMS", "M", "RM"), "region code"
        )

    def test_validate_invalid_city_locode(self):
        """A four-letter city LOCODE is rejected."""
        # 4 chars instead of 3
        self._expect_invalid(
            GHCIDComponents("NL", "NH", "AMST", "M", "RM"), "locode"
        )

    def test_validate_invalid_institution_type(self):
        """A multi-letter institution type is rejected."""
        # Too long
        self._expect_invalid(
            GHCIDComponents("NL", "NH", "AMS", "MUS", "RM"), "institution type"
        )

    def test_validate_invalid_abbreviation(self):
        """An over-long abbreviation is rejected."""
        # Too long
        self._expect_invalid(
            GHCIDComponents("NL", "NH", "AMS", "M", "RIJKSMUSEUM"), "abbreviation"
        )

    def test_uppercase_normalization(self):
        """Lowercase parts come back uppercased from to_string()."""
        lowered = dict(
            country_code="nl",
            region_code="nh",
            city_locode="ams",
            institution_type="m",
            abbreviation="rm",
        )
        # Components should be uppercase when created
        assert GHCIDComponents(**lowered).to_string() == "NL-NH-AMS-M-RM"
class TestGHCIDGenerator:
    """Covers GHCIDGenerator.generate() and GHCIDGenerator.from_isil()."""

    def test_generate_rijksmuseum(self):
        """Rijksmuseum Amsterdam is expected to become NL-NH-AMS-M-SMA."""
        request = {
            "institution_name": "Rijksmuseum",
            "english_name": "State Museum Amsterdam",
            "institution_type": InstitutionType.MUSEUM,
            "country_code": "NL",
            "region_code": "NH",
            "city_locode": "AMS",
        }
        ghcid = GHCIDGenerator().generate(**request)

        observed = (
            ghcid.country_code,
            ghcid.region_code,
            ghcid.city_locode,
            ghcid.institution_type,
            ghcid.abbreviation,
        )
        assert observed == ("NL", "NH", "AMS", "M", "SMA")
        assert ghcid.to_string() == "NL-NH-AMS-M-SMA"

    def test_generate_national_library(self):
        """The Dutch national library is expected to become NL-ZH-HAG-L-NLN."""
        request = {
            "institution_name": "Koninklijke Bibliotheek",
            "english_name": "National Library of the Netherlands",
            "institution_type": InstitutionType.LIBRARY,
            "country_code": "NL",
            "region_code": "ZH",
            "city_locode": "HAG",
        }
        ghcid = GHCIDGenerator().generate(**request)

        assert (ghcid.institution_type, ghcid.abbreviation) == ("L", "NLN")
        assert ghcid.to_string() == "NL-ZH-HAG-L-NLN"

    def test_generate_brazilian_library(self):
        """A Brazilian library is expected to become BR-RJ-RIO-L-NLB."""
        request = {
            "institution_name": "Biblioteca Nacional do Brasil",
            "english_name": "National Library of Brazil",
            "institution_type": InstitutionType.LIBRARY,
            "country_code": "BR",
            "region_code": "RJ",
            "city_locode": "RIO",
        }
        ghcid = GHCIDGenerator().generate(**request)

        assert (ghcid.country_code, ghcid.abbreviation) == ("BR", "NLB")
        assert ghcid.to_string() == "BR-RJ-RIO-L-NLB"

    def test_generate_with_invalid_components(self):
        """generate() raises ValueError when a component is malformed."""
        bad_request = {
            "institution_name": "Test Museum",
            "english_name": "Test Museum",
            "institution_type": InstitutionType.MUSEUM,
            "country_code": "INVALID",  # Invalid country code
            "region_code": "NH",
            "city_locode": "AMS",
        }
        gen = GHCIDGenerator()
        with pytest.raises(ValueError, match="Invalid GHCID"):
            gen.generate(**bad_request)

    def test_from_isil(self):
        """from_isil() takes the country code from the ISIL prefix."""
        ghcid = GHCIDGenerator().from_isil(
            isil_code="NL-AmsRM",
            english_name="State Museum Amsterdam",
            institution_type=InstitutionType.MUSEUM,
            region_code="NH",
            city_locode="AMS",
        )
        assert ghcid.country_code == "NL"
        assert ghcid.to_string() == "NL-NH-AMS-M-SMA"

    def test_from_isil_invalid_format(self):
        """from_isil() raises ValueError for a malformed ISIL code."""
        bad_request = {
            "isil_code": "INVALID",
            "english_name": "Test",
            "institution_type": InstitutionType.MUSEUM,
            "region_code": "NH",
            "city_locode": "AMS",
        }
        gen = GHCIDGenerator()
        with pytest.raises(ValueError, match="Invalid ISIL"):
            gen.from_isil(**bad_request)

    def test_institution_type_enum(self):
        """An InstitutionType member is reduced to its one-letter code."""
        ghcid = GHCIDGenerator().generate(
            institution_name="Archive",
            english_name="Test Archive",
            institution_type=InstitutionType.ARCHIVE,
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
        )
        assert ghcid.institution_type == "A"

    def test_institution_type_string(self):
        """A plain one-letter string is accepted as the institution type."""
        ghcid = GHCIDGenerator().generate(
            institution_name="Gallery",
            english_name="Test Gallery",
            institution_type="G",  # String instead of enum
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
        )
        assert ghcid.institution_type == "G"
class TestGHCIDHistory:
    """Covers GHCIDGenerator.create_history_entry()."""

    def test_create_history_entry(self):
        """A fresh entry mirrors its inputs and has no end date."""
        rijksmuseum = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        record = GHCIDGenerator().create_history_entry(
            components=rijksmuseum,
            institution_name="Rijksmuseum",
            location_city="Amsterdam",
            location_country="Netherlands",
            reason="Initial identifier",
        )

        assert record.ghcid == "NL-NH-AMS-M-RM"
        assert record.ghcid_numeric == rijksmuseum.to_numeric()
        assert record.institution_name == "Rijksmuseum"
        assert record.location_city == "Amsterdam"
        assert record.valid_to is None  # Still current
        assert record.reason == "Initial identifier"

    def test_history_entry_timestamps(self):
        """Explicit valid_from / valid_to values are stored unchanged."""
        start = datetime(2000, 1, 1, tzinfo=timezone.utc)
        end = datetime(2020, 1, 1, tzinfo=timezone.utc)

        record = GHCIDGenerator().create_history_entry(
            components=GHCIDComponents("NL", "NH", "AMS", "M", "RM"),
            institution_name="Old Name",
            location_city="Amsterdam",
            location_country="Netherlands",
            valid_from=start,
            valid_to=end,
            reason="Name change",
        )

        assert (record.valid_from, record.valid_to) == (start, end)

    def test_history_entry_default_timestamp(self):
        """Without valid_from, the entry is stamped with the current UTC time."""
        gen = GHCIDGenerator()
        parts = GHCIDComponents("NL", "NH", "AMS", "M", "RM")

        # Bracket the call with two clock readings; the default timestamp
        # has to fall between them.
        lower_bound = datetime.now(timezone.utc)
        record = gen.create_history_entry(
            components=parts,
            institution_name="Test",
            location_city="Amsterdam",
            location_country="Netherlands",
        )
        upper_bound = datetime.now(timezone.utc)

        assert lower_bound <= record.valid_from <= upper_bound
class TestEdgeCases:
    """Edge cases for ``extract_abbreviation_from_name``."""

    def test_empty_english_name(self):
        """An empty name yields an empty abbreviation."""
        abbr = extract_abbreviation_from_name("")
        assert abbr == ""

    def test_special_characters_in_name(self):
        """Hyphens and parentheses do not end up in the abbreviation."""
        # Hyphens are converted to spaces, so "Anne-Frank" → "Anne Frank"
        abbr = extract_abbreviation_from_name("Anne-Frank Museum (Amsterdam)")
        assert abbr == "AFMA"  # A(nne) F(rank) M(useum) A(msterdam)

    def test_numeric_in_abbreviation(self):
        """A name containing a number still abbreviates its leading word."""
        # Whether the digits themselves appear in the abbreviation is left
        # open here; only the initial of "Museum" is required, so the check
        # is on the first letter rather than on mere non-emptiness.
        abbr = extract_abbreviation_from_name("Museum 1944")
        assert abbr.startswith("M")

    def test_very_long_name(self):
        """A very long name produces a non-empty, reasonably short result."""
        long_name = (
            "The Royal National Museum of Art History and Cultural Heritage "
            "of the Netherlands"
        )
        abbr = extract_abbreviation_from_name(long_name)
        # Non-empty, and no longer than 10 characters (a reasonable length).
        assert 0 < len(abbr) <= 10
class TestCollisionResolution:
    """Tests for GHCID collision resolution with Wikidata Q-numbers.

    When two institutions would share the same base GHCID, a Wikidata QID is
    appended as a ``-Q<digits>`` suffix. These tests cover how the QID is
    stored, rendered, hashed and validated.
    """

    def test_ghcid_without_qid(self):
        """Without a QID the GHCID keeps its standard five-part form."""
        components = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="RM",
        )
        # Should not have Q-number
        assert components.wikidata_qid is None
        # GHCID should be standard format
        assert components.to_string() == "NL-NH-AMS-M-RM"

    def test_ghcid_with_qid(self):
        """A QID is stored without its prefix and rendered as a suffix."""
        components = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q924335",
        )
        # Q-number should be stored without prefix
        assert components.wikidata_qid == "924335"
        # GHCID should include Q-number suffix
        assert components.to_string() == "NL-NH-AMS-M-SM-Q924335"

    def test_qid_normalization_with_prefix(self):
        """The leading "Q" is stripped during initialization."""
        c1 = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q924335",
        )
        # Stored without Q
        assert c1.wikidata_qid == "924335"
        # But displayed with Q
        assert c1.to_string() == "NL-NH-AMS-M-SM-Q924335"

    def test_qid_normalization_without_prefix(self):
        """A bare numeric QID is accepted and rendered with a "Q"."""
        c2 = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="924335",
        )
        # Should produce same result
        assert c2.wikidata_qid == "924335"
        assert c2.to_string() == "NL-NH-AMS-M-SM-Q924335"

    def test_qid_consistency(self):
        """Prefixed and bare QID inputs produce identical components."""
        # With Q prefix
        c1 = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")
        # Without Q prefix
        c2 = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="924335")
        # Should be identical
        assert c1.to_string() == c2.to_string()
        assert c1.to_numeric() == c2.to_numeric()
        assert c1.wikidata_qid == c2.wikidata_qid

    def test_collision_example_stedelijk_museum(self):
        """Real collision example: Stedelijk Museum Amsterdam."""
        stedelijk = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q924335",  # Real Wikidata ID for Stedelijk Museum
        )
        assert stedelijk.to_string() == "NL-NH-AMS-M-SM-Q924335"
        # Numeric hash should be different from base GHCID
        base = GHCIDComponents("NL", "NH", "AMS", "M", "SM")
        assert stedelijk.to_numeric() != base.to_numeric()

    def test_collision_example_science_museum(self):
        """Hypothetical collision: Science Museum Amsterdam."""
        science = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q123456",  # Hypothetical Q-number
        )
        assert science.to_string() == "NL-NH-AMS-M-SM-Q123456"

    def test_collision_different_numeric_hashes(self):
        """Colliding GHCIDs with different Q-numbers hash differently."""
        # Two museums in Amsterdam both abbreviated "SM"
        stedelijk = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")
        science = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q123456")
        base = GHCIDComponents("NL", "NH", "AMS", "M", "SM")
        # All three should have different numeric hashes
        assert stedelijk.to_numeric() != science.to_numeric()
        assert stedelijk.to_numeric() != base.to_numeric()
        assert science.to_numeric() != base.to_numeric()
        # But same base format (before Q-number)
        assert stedelijk.to_string().startswith("NL-NH-AMS-M-SM")
        assert science.to_string().startswith("NL-NH-AMS-M-SM")
        assert base.to_string() == "NL-NH-AMS-M-SM"

    def test_qid_validation_numeric_only(self):
        """validate() accepts a numeric QID, with or without the "Q"."""
        # Valid: numeric after Q stripped
        c1 = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")
        is_valid, error = c1.validate()
        assert is_valid
        assert error is None
        # Valid: numeric without Q
        c2 = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="924335")
        is_valid, error = c2.validate()
        assert is_valid
        assert error is None

    def test_qid_invalid_characters(self):
        """validate() rejects a QID that is not numeric."""
        # Invalid: letters after Q stripped
        c = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="QABC")
        is_valid, error = c.validate()
        assert not is_valid
        assert "Wikidata QID" in error
        assert "numeric" in error.lower()

    def test_schema_regex_pattern_without_qid(self):
        """A GHCID without Q-number matches the schema pattern."""
        import re

        # Pattern from schema (without Q-suffix)
        pattern = r'^[A-Z]{2}-[A-Z0-9]{1,3}-[A-Z]{3}-[A-Z]-[A-Z0-9]{1,10}$'
        components = GHCIDComponents("NL", "NH", "AMS", "M", "RM")
        ghcid = components.to_string()
        # fullmatch rather than match: with match, "$" would also accept a
        # trailing newline after the identifier.
        assert re.fullmatch(pattern, ghcid)

    def test_schema_regex_pattern_with_qid(self):
        """A GHCID with Q-number matches the schema pattern."""
        import re

        # Pattern from schema (with optional Q-suffix)
        pattern = r'^[A-Z]{2}-[A-Z0-9]{1,3}-[A-Z]{3}-[A-Z]-[A-Z0-9]{1,10}(-Q[0-9]+)?$'
        components = GHCIDComponents("NL", "NH", "AMS", "M", "SM", wikidata_qid="Q924335")
        ghcid = components.to_string()
        # fullmatch rather than match: with match, "$" would also accept a
        # trailing newline after the identifier.
        assert re.fullmatch(pattern, ghcid)

    def test_history_entry_with_collision(self):
        """A history entry for a collision-resolved GHCID carries the QID."""
        generator = GHCIDGenerator()
        components = GHCIDComponents(
            country_code="NL",
            region_code="NH",
            city_locode="AMS",
            institution_type="M",
            abbreviation="SM",
            wikidata_qid="Q924335",
        )
        entry = generator.create_history_entry(
            components=components,
            institution_name="Stedelijk Museum",
            location_city="Amsterdam",
            location_country="Netherlands",
            reason="Collision detected - added Wikidata Q-number",
        )
        # Entry should include Q-number in GHCID string
        assert entry.ghcid == "NL-NH-AMS-M-SM-Q924335"
        # Case-insensitive check; lowercasing already covers "Collision".
        assert "collision" in entry.reason.lower()
        # Numeric hash should reflect Q-number
        assert entry.ghcid_numeric == components.to_numeric()