- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
351 lines
13 KiB
Python
351 lines
13 KiB
Python
"""
|
|
Tests for IdentifierExtractor
|
|
"""
|
|
|
|
import pytest
|
|
from glam_extractor.extractors.identifiers import IdentifierExtractor
|
|
from glam_extractor.models import Identifier
|
|
|
|
|
|
@pytest.fixture
def extractor():
    """Build the IdentifierExtractor instance used by the tests in this module."""
    instance = IdentifierExtractor()
    return instance
|
|
|
|
|
|
class TestISILExtraction:
    """Checks for ``extract_isil_codes`` on the extractor fixture."""

    def test_extract_single_isil_code(self, extractor):
        """One ISIL code in a sentence is expected to give one identifier."""
        found = extractor.extract_isil_codes("The museum has ISIL code NL-AsdRM.")

        assert len(found) == 1
        only = found[0]
        assert only.identifier_scheme == "ISIL"
        # Expected value is the upper-cased form of the code in the text.
        assert only.identifier_value == "NL-ASDRM"

    def test_extract_multiple_isil_codes(self, extractor):
        """Four comma-separated codes are all expected, each upper-cased."""
        sample = "ISIL codes: NL-AsdRM, NL-HaNA, US-DLC, GB-UkLoNL"
        found = extractor.extract_isil_codes(sample)

        assert len(found) == 4
        extracted_values = [ident.identifier_value for ident in found]
        for expected in ("NL-ASDRM", "NL-HANA", "US-DLC", "GB-UKLONL"):
            assert expected in extracted_values

    def test_extract_isil_with_mixed_case(self, extractor):
        """An all-lowercase code is expected back in upper case."""
        found = extractor.extract_isil_codes("ISIL: nl-asdrm")

        assert len(found) == 1
        assert found[0].identifier_value == "NL-ASDRM"

    def test_invalid_country_code_rejected(self, extractor):
        """A code with the prefix ``XX`` is expected to be ignored."""
        # XX is not a valid country code
        found = extractor.extract_isil_codes("Invalid code: XX-12345")

        assert len(found) == 0

    def test_isil_from_rijksmuseum_sample(self, extractor):
        """The Rijksmuseum sample sentence yields exactly its ISIL code."""
        sample = "The museum has an ISIL code NL-AsdRM. It is located at Museumstraat 1."
        found = extractor.extract_isil_codes(sample)

        assert len(found) == 1
        assert found[0].identifier_value == "NL-ASDRM"

    def test_isil_from_nationaal_archief_sample(self, extractor):
        """The Nationaal Archief sample sentence yields exactly its ISIL code."""
        sample = "The Nationaal Archief (National Archive) is located in The Hague. ISIL: NL-HaNA."
        found = extractor.extract_isil_codes(sample)

        assert len(found) == 1
        assert found[0].identifier_value == "NL-HANA"
|
|
|
|
|
|
class TestWikidataExtraction:
    """Checks for ``extract_wikidata_ids`` on the extractor fixture."""

    def test_extract_single_wikidata_id(self, extractor):
        """A single Q-number is expected with scheme, value and a wikidata.org URL."""
        found = extractor.extract_wikidata_ids("The Rijksmuseum is Q190804 on Wikidata.")

        assert len(found) == 1
        only = found[0]
        assert only.identifier_scheme == "Wikidata"
        assert only.identifier_value == "Q190804"
        # identifier_url is stringified before the substring check.
        assert "wikidata.org" in str(only.identifier_url)

    def test_extract_multiple_wikidata_ids(self, extractor):
        """Three Q-numbers in one line are all expected in the result."""
        sample = "Museums: Q190804 (Rijksmuseum), Q1526131 (Van Gogh), Q679527 (Stedelijk)"
        found = extractor.extract_wikidata_ids(sample)

        assert len(found) == 3
        extracted_values = [ident.identifier_value for ident in found]
        for expected in ("Q190804", "Q1526131", "Q679527"):
            assert expected in extracted_values

    def test_wikidata_url_construction(self, extractor):
        """The identifier URL is expected to be the canonical wiki page URL."""
        found = extractor.extract_wikidata_ids("Q190804")

        assert len(found) == 1
        url_text = str(found[0].identifier_url)
        assert url_text == "https://www.wikidata.org/wiki/Q190804"
|
|
|
|
|
|
class TestVIAFExtraction:
    """Checks for ``extract_viaf_ids`` on the extractor fixture."""

    def test_extract_viaf_from_url(self, extractor):
        """A full https VIAF URL is expected to give one VIAF identifier."""
        sample = "See https://viaf.org/viaf/123456789 for authority data."
        found = extractor.extract_viaf_ids(sample)

        assert len(found) == 1
        only = found[0]
        assert only.identifier_scheme == "VIAF"
        assert only.identifier_value == "123456789"
        assert "viaf.org" in str(only.identifier_url)

    def test_extract_multiple_viaf_ids(self, extractor):
        """Two scheme-less ``viaf.org/viaf/...`` references are both expected."""
        sample = "VIAF IDs: viaf.org/viaf/111111111 and viaf.org/viaf/222222222"
        found = extractor.extract_viaf_ids(sample)

        assert len(found) == 2
        extracted_values = [ident.identifier_value for ident in found]
        for expected in ("111111111", "222222222"):
            assert expected in extracted_values

    def test_viaf_case_insensitive(self, extractor):
        """An upper-case ``VIAF.ORG/VIAF/`` reference is expected to match too."""
        found = extractor.extract_viaf_ids("See VIAF.ORG/VIAF/123456789")

        assert len(found) == 1
        assert found[0].identifier_value == "123456789"
|
|
|
|
|
|
class TestKvKExtraction:
    """Checks for ``extract_kvk_numbers`` (Dutch KvK numbers)."""

    def test_extract_kvk_with_context(self, extractor):
        """An 8-digit number after "KvK number:" is expected when context is required."""
        found = extractor.extract_kvk_numbers("KvK number: 12345678", context_required=True)

        assert len(found) == 1
        only = found[0]
        assert only.identifier_scheme == "KvK"
        assert only.identifier_value == "12345678"

    def test_extract_kvk_full_name(self, extractor):
        """The spelled-out "Kamer van Koophandel" label is expected to count as context."""
        sample = "Kamer van Koophandel nummer: 87654321"
        found = extractor.extract_kvk_numbers(sample, context_required=True)

        assert len(found) == 1
        assert found[0].identifier_value == "87654321"

    def test_no_kvk_without_context(self, extractor):
        """With context required, a bare 8-digit number is expected to be skipped."""
        # Random 8-digit number without KvK context should be ignored
        sample = "The building number is 12345678"
        found = extractor.extract_kvk_numbers(sample, context_required=True)

        assert len(found) == 0

    def test_kvk_without_context_requirement(self, extractor):
        """With ``context_required=False`` a bare 8-digit number is expected to be kept."""
        sample = "The number is 12345678"
        found = extractor.extract_kvk_numbers(sample, context_required=False)

        # Should extract even without context when context_required=False
        assert len(found) >= 1

    def test_reject_year_as_kvk(self, extractor):
        """A 4-digit year next to a KvK number must not show up as a KvK value."""
        # Should not extract years as KvK numbers
        sample = "KvK: 12345678. Founded in 1995."
        found = extractor.extract_kvk_numbers(sample, context_required=False)

        # Should extract 12345678 but not 1995
        extracted_values = [ident.identifier_value for ident in found]
        assert "12345678" in extracted_values
        assert "1995" not in extracted_values
|
|
|
|
|
|
class TestURLExtraction:
    """Checks for ``extract_urls`` on the extractor fixture."""

    def test_extract_http_url(self, extractor):
        """A plain ``http://`` URL in a sentence is expected as the single result."""
        links = extractor.extract_urls("Visit http://www.rijksmuseum.nl for more info.")

        assert len(links) == 1
        assert "rijksmuseum.nl" in links[0]

    def test_extract_https_url(self, extractor):
        """An ``https://`` URL at the end of the text is expected as the single result."""
        links = extractor.extract_urls("Website: https://www.nationaalarchief.nl")

        assert len(links) == 1
        assert "nationaalarchief.nl" in links[0]

    def test_extract_multiple_urls(self, extractor):
        """Three comma-separated URLs are expected as three results."""
        sample = "Sites: https://rijksmuseum.nl, https://vangoghmuseum.nl, https://stedelijk.nl"
        links = extractor.extract_urls(sample)

        assert len(links) == 3

    def test_filter_urls_by_domain(self, extractor):
        """With ``filter_domains=["museum"]`` only the rijksmuseum URL is expected."""
        sample = "Museums: https://rijksmuseum.nl, Libraries: https://kb.nl, Archives: https://gahetna.nl"
        museum_links = extractor.extract_urls(sample, filter_domains=["museum"])

        assert len(museum_links) == 1
        assert "rijksmuseum.nl" in museum_links[0]

    def test_url_with_path(self, extractor):
        """The path part of a URL is expected to be kept in the result."""
        links = extractor.extract_urls("Collection: https://www.rijksmuseum.nl/en/rijksstudio")

        assert len(links) == 1
        assert "rijksstudio" in links[0]
|
|
|
|
|
|
class TestExtractAll:
    """Checks for ``extract_all``, which gathers every identifier type in one call."""

    def test_extract_all_from_rijksmuseum_text(self, extractor):
        """The Rijksmuseum paragraph is expected to yield an ISIL code and a URL."""
        sample = """
        The Rijksmuseum is a Dutch national museum in Amsterdam.
        The museum has an ISIL code NL-AsdRM.
        It is located at Museumstraat 1, 1071 XX Amsterdam.
        The museum holds over 1 million objects in its collection.
        Their digital collection is available at https://www.rijksmuseum.nl/en/rijksstudio.
        """
        found = extractor.extract_all(sample, include_urls=True)

        # Should find ISIL code and URL
        seen_schemes = {ident.identifier_scheme for ident in found}
        for expected in ("ISIL", "URL"):
            assert expected in seen_schemes

        # Check ISIL value
        isil_only = [ident for ident in found if ident.identifier_scheme == "ISIL"]
        assert len(isil_only) == 1
        assert isil_only[0].identifier_value == "NL-ASDRM"

    def test_extract_all_from_nationaal_archief_text(self, extractor):
        """The Nationaal Archief paragraph is expected to yield an ISIL code and a URL."""
        sample = """
        The Nationaal Archief (National Archive) is located in The Hague, Netherlands.
        ISIL: NL-HaNA.
        Address: Prins Willem-Alexanderhof 20, 2595 BE Den Haag.
        Their catalog is searchable at https://www.nationaalarchief.nl.
        """
        found = extractor.extract_all(sample, include_urls=True)

        seen_schemes = {ident.identifier_scheme for ident in found}
        for expected in ("ISIL", "URL"):
            assert expected in seen_schemes

    def test_deduplication(self, extractor):
        """An ISIL code mentioned twice is expected only once in the result."""
        # Same ISIL code mentioned twice
        sample = "ISIL code NL-AsdRM. The code NL-AsdRM identifies the museum."
        found = extractor.extract_all(sample)

        # Should only return one instance
        isil_only = [ident for ident in found if ident.identifier_scheme == "ISIL"]
        assert len(isil_only) == 1

    def test_mixed_identifiers(self, extractor):
        """Text carrying all five identifier kinds is expected to report all five schemes."""
        sample = """
        Rijksmuseum (Q190804) has ISIL code NL-AsdRM.
        VIAF: viaf.org/viaf/123456789.
        KvK: 12345678.
        Website: https://rijksmuseum.nl
        """
        found = extractor.extract_all(sample, include_urls=True)

        seen_schemes = {ident.identifier_scheme for ident in found}
        for expected in ("ISIL", "Wikidata", "VIAF", "KvK", "URL"):
            assert expected in seen_schemes
|
|
|
|
|
|
class TestExtractWithContext:
    """Checks for ``extract_with_context``, which returns identifiers with nearby text."""

    @staticmethod
    def _first_isil(results):
        """Return the first result whose identifier scheme is "ISIL", or None."""
        for entry in results:
            if entry["identifier"].identifier_scheme == "ISIL":
                return entry
        return None

    def test_extract_with_context_basic(self, extractor):
        """The ISIL hit is expected to carry words from both sides of the code."""
        sample = "The Rijksmuseum has ISIL code NL-AsdRM and is located in Amsterdam."
        results = extractor.extract_with_context(sample, context_window=20)

        # Should find NL-AsdRM
        assert len(results) > 0

        # Find the ISIL code result
        isil_hit = self._first_isil(results)
        assert isil_hit is not None

        # Check context includes nearby words
        before = isil_hit["context_before"]
        after = isil_hit["context_after"]
        assert any(fragment in before for fragment in ("ISIL code", "code"))
        assert any(fragment in after for fragment in ("and is", "and"))

    def test_context_window_size(self, extractor):
        """With ``context_window=5`` each context string is at most 6 characters."""
        sample = "Museum ISIL: NL-AsdRM details"
        results = extractor.extract_with_context(sample, context_window=5)

        isil_hit = self._first_isil(results)
        assert isil_hit is not None

        # With window=5, should get ~5 chars before/after
        # ("SIL: " is the 5 characters right before the code in the sample).
        assert len(isil_hit["context_before"]) <= 6
        assert len(isil_hit["context_after"]) <= 6

    def test_position_information(self, extractor):
        """The result's "position" is expected to equal the code's index in the text."""
        sample = "ISIL: NL-AsdRM"
        results = extractor.extract_with_context(sample, context_window=10)

        isil_hit = self._first_isil(results)
        assert isil_hit is not None
        assert "position" in isil_hit
        expected_offset = sample.index("NL-AsdRM")
        assert isil_hit["position"] == expected_offset
|
|
|
|
|
|
class TestEdgeCases:
    """Edge cases and boundary conditions for the extractor."""

    def test_empty_string(self, extractor):
        """An empty input is expected to produce no identifiers."""
        found = extractor.extract_all("")
        assert len(found) == 0

    def test_no_identifiers(self, extractor):
        """Plain prose without identifiers is expected to produce nothing."""
        found = extractor.extract_all("This is just plain text with no identifiers.")
        assert len(found) == 0

    def test_special_characters(self, extractor):
        """A code directly followed by ``;`` is still expected to be extracted."""
        found = extractor.extract_isil_codes("ISIL: NL-AsdRM; other info")
        assert len(found) == 1
        assert found[0].identifier_value == "NL-ASDRM"

    def test_multiline_text(self, extractor):
        """Identifiers spread over several lines are all expected to be found."""
        sample = """
        Line 1: ISIL NL-AsdRM
        Line 2: Wikidata Q190804
        Line 3: Website https://rijksmuseum.nl
        """
        found = extractor.extract_all(sample, include_urls=True)

        seen_schemes = {ident.identifier_scheme for ident in found}
        for expected in ("ISIL", "Wikidata", "URL"):
            assert expected in seen_schemes

    def test_unicode_text(self, extractor):
        """Dutch-language text is expected to yield at least the ISIL code and the URL."""
        # NOTE(review): this sample contains only ASCII characters, so it does
        # not exercise non-ASCII input despite the test name - confirm intent.
        sample = "Het Rijksmuseum heeft ISIL-code NL-AsdRM. Bezoek https://rijksmuseum.nl"
        found = extractor.extract_all(sample, include_urls=True)

        assert len(found) >= 2  # ISIL and URL

    def test_partial_matches_not_extracted(self, extractor):
        """A code with an extra leading letter must not come back as ``XNL-ASDRM``."""
        # Should not extract partial ISIL codes
        sample = "Code: XNL-AsdRM is invalid"  # Extra X prefix
        found = extractor.extract_isil_codes(sample)

        # Should not match because of the X prefix
        prefixed = [ident for ident in found if ident.identifier_value == "XNL-ASDRM"]
        assert len(prefixed) == 0
|