glam/tests/extractors/test_identifiers.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

351 lines
13 KiB
Python

"""
Tests for IdentifierExtractor
"""
import pytest
from glam_extractor.extractors.identifiers import IdentifierExtractor
from glam_extractor.models import Identifier
@pytest.fixture
def extractor():
    """Build the IdentifierExtractor injected into tests that request ``extractor``."""
    instance = IdentifierExtractor()
    return instance
class TestISILExtraction:
    """Tests for ``extract_isil_codes``: matching, uppercase values, rejection."""

    def test_extract_single_isil_code(self, extractor):
        """A single ISIL code in a sentence yields exactly one ISIL identifier."""
        found = extractor.extract_isil_codes("The museum has ISIL code NL-AsdRM.")

        assert len(found) == 1
        assert found[0].identifier_scheme == "ISIL"
        # The value is expected in uppercase even though the input is mixed case.
        assert found[0].identifier_value == "NL-ASDRM"

    def test_extract_multiple_isil_codes(self, extractor):
        """Four comma-separated codes are all returned, each in uppercase."""
        found = extractor.extract_isil_codes(
            "ISIL codes: NL-AsdRM, NL-HaNA, US-DLC, GB-UkLoNL"
        )

        assert len(found) == 4
        values = [ident.identifier_value for ident in found]
        for expected in ("NL-ASDRM", "NL-HANA", "US-DLC", "GB-UKLONL"):
            assert expected in values

    def test_extract_isil_with_mixed_case(self, extractor):
        """An all-lowercase code is still extracted and reported in uppercase."""
        found = extractor.extract_isil_codes("ISIL: nl-asdrm")

        assert len(found) == 1
        assert found[0].identifier_value == "NL-ASDRM"

    def test_invalid_country_code_rejected(self, extractor):
        """A code with the prefix XX produces no identifiers."""
        # XX is not a valid country code
        found = extractor.extract_isil_codes("Invalid code: XX-12345")

        assert len(found) == 0

    def test_isil_from_rijksmuseum_sample(self, extractor):
        """The Rijksmuseum sample sentence yields the single code NL-ASDRM."""
        sample = "The museum has an ISIL code NL-AsdRM. It is located at Museumstraat 1."
        found = extractor.extract_isil_codes(sample)

        assert len(found) == 1
        assert found[0].identifier_value == "NL-ASDRM"

    def test_isil_from_nationaal_archief_sample(self, extractor):
        """The Nationaal Archief sample sentence yields the single code NL-HANA."""
        sample = "The Nationaal Archief (National Archive) is located in The Hague. ISIL: NL-HaNA."
        found = extractor.extract_isil_codes(sample)

        assert len(found) == 1
        assert found[0].identifier_value == "NL-HANA"
class TestWikidataExtraction:
    """Tests for ``extract_wikidata_ids``: Q-id matching and URL construction."""

    def test_extract_single_wikidata_id(self, extractor):
        """One Q-id yields one Wikidata identifier whose URL points at wikidata.org."""
        found = extractor.extract_wikidata_ids("The Rijksmuseum is Q190804 on Wikidata.")

        assert len(found) == 1
        assert found[0].identifier_scheme == "Wikidata"
        assert found[0].identifier_value == "Q190804"
        # The URL attribute is compared via str(), so any str-convertible type works.
        assert "wikidata.org" in str(found[0].identifier_url)

    def test_extract_multiple_wikidata_ids(self, extractor):
        """Three Q-ids in one line are all returned."""
        found = extractor.extract_wikidata_ids(
            "Museums: Q190804 (Rijksmuseum), Q1526131 (Van Gogh), Q679527 (Stedelijk)"
        )

        assert len(found) == 3
        values = [ident.identifier_value for ident in found]
        for expected in ("Q190804", "Q1526131", "Q679527"):
            assert expected in values

    def test_wikidata_url_construction(self, extractor):
        """A bare Q-id is given the canonical wiki page URL."""
        found = extractor.extract_wikidata_ids("Q190804")

        assert len(found) == 1
        assert str(found[0].identifier_url) == "https://www.wikidata.org/wiki/Q190804"
class TestVIAFExtraction:
    """Tests for ``extract_viaf_ids``: URL-embedded ids and case-insensitivity."""

    def test_extract_viaf_from_url(self, extractor):
        """A full https VIAF URL yields the numeric id plus a viaf.org URL."""
        found = extractor.extract_viaf_ids(
            "See https://viaf.org/viaf/123456789 for authority data."
        )

        assert len(found) == 1
        assert found[0].identifier_scheme == "VIAF"
        assert found[0].identifier_value == "123456789"
        assert "viaf.org" in str(found[0].identifier_url)

    def test_extract_multiple_viaf_ids(self, extractor):
        """Two scheme-less VIAF URLs in one sentence are both extracted."""
        found = extractor.extract_viaf_ids(
            "VIAF IDs: viaf.org/viaf/111111111 and viaf.org/viaf/222222222"
        )

        assert len(found) == 2
        values = [ident.identifier_value for ident in found]
        for expected in ("111111111", "222222222"):
            assert expected in values

    def test_viaf_case_insensitive(self, extractor):
        """An all-uppercase VIAF URL is still recognized."""
        found = extractor.extract_viaf_ids("See VIAF.ORG/VIAF/123456789")

        assert len(found) == 1
        assert found[0].identifier_value == "123456789"
class TestKvKExtraction:
    """Tests for ``extract_kvk_numbers`` with and without required context."""

    def test_extract_kvk_with_context(self, extractor):
        """A number labelled "KvK number" is extracted when context is required."""
        found = extractor.extract_kvk_numbers("KvK number: 12345678", context_required=True)

        assert len(found) == 1
        assert found[0].identifier_scheme == "KvK"
        assert found[0].identifier_value == "12345678"

    def test_extract_kvk_full_name(self, extractor):
        """The spelled-out "Kamer van Koophandel" label also counts as context."""
        found = extractor.extract_kvk_numbers(
            "Kamer van Koophandel nummer: 87654321", context_required=True
        )

        assert len(found) == 1
        assert found[0].identifier_value == "87654321"

    def test_no_kvk_without_context(self, extractor):
        """An unlabelled 8-digit number is ignored when context is required."""
        # Random 8-digit number without KvK context should be ignored
        found = extractor.extract_kvk_numbers(
            "The building number is 12345678", context_required=True
        )

        assert len(found) == 0

    def test_kvk_without_context_requirement(self, extractor):
        """With ``context_required=False`` an unlabelled number is extracted."""
        found = extractor.extract_kvk_numbers("The number is 12345678", context_required=False)

        # Should extract even without context when context_required=False
        assert len(found) >= 1

    def test_reject_year_as_kvk(self, extractor):
        """A four-digit year next to a KvK number is not reported as a KvK value."""
        # Should not extract years as KvK numbers
        found = extractor.extract_kvk_numbers(
            "KvK: 12345678. Founded in 1995.", context_required=False
        )

        # Should extract 12345678 but not 1995
        values = [ident.identifier_value for ident in found]
        assert "12345678" in values
        assert "1995" not in values
class TestURLExtraction:
    """Tests for ``extract_urls``: schemes, multiple hits, domain filter, paths."""

    def test_extract_http_url(self, extractor):
        """A plain http URL embedded in a sentence is returned."""
        hits = extractor.extract_urls("Visit http://www.rijksmuseum.nl for more info.")

        assert len(hits) == 1
        assert "rijksmuseum.nl" in hits[0]

    def test_extract_https_url(self, extractor):
        """An https URL at the end of the text is returned."""
        hits = extractor.extract_urls("Website: https://www.nationaalarchief.nl")

        assert len(hits) == 1
        assert "nationaalarchief.nl" in hits[0]

    def test_extract_multiple_urls(self, extractor):
        """Three comma-separated URLs produce three results."""
        hits = extractor.extract_urls(
            "Sites: https://rijksmuseum.nl, https://vangoghmuseum.nl, https://stedelijk.nl"
        )

        assert len(hits) == 3

    def test_filter_urls_by_domain(self, extractor):
        """``filter_domains=["museum"]`` keeps only the URL containing "museum"."""
        sample = "Museums: https://rijksmuseum.nl, Libraries: https://kb.nl, Archives: https://gahetna.nl"
        museum_urls = extractor.extract_urls(sample, filter_domains=["museum"])

        assert len(museum_urls) == 1
        assert "rijksmuseum.nl" in museum_urls[0]

    def test_url_with_path(self, extractor):
        """The path component of a URL is kept in the extracted value."""
        hits = extractor.extract_urls("Collection: https://www.rijksmuseum.nl/en/rijksstudio")

        assert len(hits) == 1
        assert "rijksstudio" in hits[0]
class TestExtractAll:
    """Tests for ``extract_all``, which gathers every identifier type in one call."""

    def test_extract_all_from_rijksmuseum_text(self, extractor):
        """The Rijksmuseum paragraph yields an ISIL code and a URL."""
        # The sample's lines sit at column 0 on purpose: they are part of the
        # string literal, and indenting them would change the text under test.
        sample = """
The Rijksmuseum is a Dutch national museum in Amsterdam.
The museum has an ISIL code NL-AsdRM.
It is located at Museumstraat 1, 1071 XX Amsterdam.
The museum holds over 1 million objects in its collection.
Their digital collection is available at https://www.rijksmuseum.nl/en/rijksstudio.
"""
        found = extractor.extract_all(sample, include_urls=True)

        # Should find ISIL code and URL
        schemes = {ident.identifier_scheme for ident in found}
        for scheme in ("ISIL", "URL"):
            assert scheme in schemes

        # Check ISIL value
        isil_only = [ident for ident in found if ident.identifier_scheme == "ISIL"]
        assert len(isil_only) == 1
        assert isil_only[0].identifier_value == "NL-ASDRM"

    def test_extract_all_from_nationaal_archief_text(self, extractor):
        """The Nationaal Archief paragraph yields an ISIL code and a URL."""
        sample = """
The Nationaal Archief (National Archive) is located in The Hague, Netherlands.
ISIL: NL-HaNA.
Address: Prins Willem-Alexanderhof 20, 2595 BE Den Haag.
Their catalog is searchable at https://www.nationaalarchief.nl.
"""
        found = extractor.extract_all(sample, include_urls=True)

        schemes = {ident.identifier_scheme for ident in found}
        for scheme in ("ISIL", "URL"):
            assert scheme in schemes

    def test_deduplication(self, extractor):
        """An ISIL code mentioned twice is reported only once."""
        # Same ISIL code mentioned twice
        sample = "ISIL code NL-AsdRM. The code NL-AsdRM identifies the museum."
        found = extractor.extract_all(sample)

        # Should only return one instance
        isil_only = [ident for ident in found if ident.identifier_scheme == "ISIL"]
        assert len(isil_only) == 1

    def test_mixed_identifiers(self, extractor):
        """Text carrying all five identifier kinds reports all five schemes."""
        sample = """
Rijksmuseum (Q190804) has ISIL code NL-AsdRM.
VIAF: viaf.org/viaf/123456789.
KvK: 12345678.
Website: https://rijksmuseum.nl
"""
        found = extractor.extract_all(sample, include_urls=True)

        schemes = {ident.identifier_scheme for ident in found}
        for scheme in ("ISIL", "Wikidata", "VIAF", "KvK", "URL"):
            assert scheme in schemes
class TestExtractWithContext:
    """Tests for ``extract_with_context``: surrounding text and match position."""

    @staticmethod
    def _first_isil(results):
        """Return the first result whose identifier scheme is "ISIL", else None."""
        for entry in results:
            if entry["identifier"].identifier_scheme == "ISIL":
                return entry
        return None

    def test_extract_with_context_basic(self, extractor):
        """The ISIL hit carries the words around it in its before/after context."""
        sample = "The Rijksmuseum has ISIL code NL-AsdRM and is located in Amsterdam."
        results = extractor.extract_with_context(sample, context_window=20)

        # Should find NL-AsdRM
        assert len(results) > 0

        # Find the ISIL code result
        isil_result = self._first_isil(results)
        assert isil_result is not None

        # Check context includes nearby words
        before = isil_result["context_before"]
        assert "ISIL code" in before or "code" in before
        after = isil_result["context_after"]
        assert "and is" in after or "and" in after

    def test_context_window_size(self, extractor):
        """A window of 5 keeps each context side to at most 6 characters."""
        results = extractor.extract_with_context("Museum ISIL: NL-AsdRM details", context_window=5)

        isil_result = self._first_isil(results)
        assert isil_result is not None

        # With window=5, should get ~5 chars before/after
        assert len(isil_result["context_before"]) <= 6  # "SIL: " = 5 chars
        assert len(isil_result["context_after"]) <= 6

    def test_position_information(self, extractor):
        """The result's "position" equals the offset of the code in the text."""
        text = "ISIL: NL-AsdRM"
        results = extractor.extract_with_context(text, context_window=10)

        isil_result = self._first_isil(results)
        assert isil_result is not None
        assert "position" in isil_result
        assert isil_result["position"] == text.index("NL-AsdRM")
class TestEdgeCases:
    """Boundary-condition tests: empty input, punctuation, multi-line text."""

    def test_empty_string(self, extractor):
        """An empty string produces no identifiers."""
        found = extractor.extract_all("")

        assert len(found) == 0

    def test_no_identifiers(self, extractor):
        """Plain prose without any identifier produces no results."""
        found = extractor.extract_all("This is just plain text with no identifiers.")

        assert len(found) == 0

    def test_special_characters(self, extractor):
        """A semicolon directly after the code does not disturb extraction."""
        found = extractor.extract_isil_codes("ISIL: NL-AsdRM; other info")

        assert len(found) == 1
        assert found[0].identifier_value == "NL-ASDRM"

    def test_multiline_text(self, extractor):
        """Identifiers on separate lines are all found in one ``extract_all`` call."""
        # The sample's lines sit at column 0 on purpose: they are part of the
        # string literal, and indenting them would change the text under test.
        sample = """
Line 1: ISIL NL-AsdRM
Line 2: Wikidata Q190804
Line 3: Website https://rijksmuseum.nl
"""
        found = extractor.extract_all(sample, include_urls=True)

        schemes = {ident.identifier_scheme for ident in found}
        for scheme in ("ISIL", "Wikidata", "URL"):
            assert scheme in schemes

    def test_unicode_text(self, extractor):
        """Dutch-language text yields at least the ISIL code and the URL."""
        # NOTE(review): this sample is pure ASCII despite the test name; consider
        # adding a non-ASCII sample once the extractor's behavior is confirmed.
        sample = "Het Rijksmuseum heeft ISIL-code NL-AsdRM. Bezoek https://rijksmuseum.nl"
        found = extractor.extract_all(sample, include_urls=True)

        assert len(found) >= 2  # ISIL and URL

    def test_partial_matches_not_extracted(self, extractor):
        """A code with an extra leading letter is never reported as "XNL-ASDRM"."""
        # Should not extract partial ISIL codes
        sample = "Code: XNL-AsdRM is invalid"  # Extra X prefix
        found = extractor.extract_isil_codes(sample)

        # Should not match because of the X prefix
        # NOTE(review): this only rules out the literal value "XNL-ASDRM"; it
        # does not assert that "NL-ASDRM" is not pulled from inside the token.
        prefixed = [ident for ident in found if ident.identifier_value == "XNL-ASDRM"]
        assert len(prefixed) == 0