"""Tests for IdentifierExtractor.

Covers ISIL, Wikidata, VIAF, KvK and URL extraction, the combined
``extract_all`` entry point, context-aware extraction and edge cases.
"""

import pytest

from glam_extractor.extractors.identifiers import IdentifierExtractor
from glam_extractor.models import Identifier


@pytest.fixture
def extractor():
    """Fixture providing an IdentifierExtractor instance."""
    return IdentifierExtractor()


class TestISILExtraction:
    """Test ISIL code extraction."""

    def test_extract_single_isil_code(self, extractor):
        """A single ISIL code in running text is found and upper-cased."""
        text = "The museum has ISIL code NL-AsdRM."
        identifiers = extractor.extract_isil_codes(text)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_scheme == "ISIL"
        assert identifiers[0].identifier_value == "NL-ASDRM"  # Normalized to uppercase

    def test_extract_multiple_isil_codes(self, extractor):
        """Every code of a comma-separated list is returned."""
        text = "ISIL codes: NL-AsdRM, NL-HaNA, US-DLC, GB-UkLoNL"
        identifiers = extractor.extract_isil_codes(text)

        assert len(identifiers) == 4
        values = [ident.identifier_value for ident in identifiers]
        assert "NL-ASDRM" in values
        assert "NL-HANA" in values
        assert "US-DLC" in values
        assert "GB-UKLONL" in values

    def test_extract_isil_with_mixed_case(self, extractor):
        """A fully lower-case code is still extracted and upper-cased."""
        text = "ISIL: nl-asdrm"
        identifiers = extractor.extract_isil_codes(text)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_value == "NL-ASDRM"

    def test_invalid_country_code_rejected(self, extractor):
        """A code with an unknown country prefix yields no identifiers."""
        # XX is not a valid country code
        text = "Invalid code: XX-12345"
        identifiers = extractor.extract_isil_codes(text)

        assert len(identifiers) == 0

    def test_isil_from_rijksmuseum_sample(self, extractor):
        """The Rijksmuseum sample sentence yields exactly NL-ASDRM."""
        text = "The museum has an ISIL code NL-AsdRM. It is located at Museumstraat 1."
        identifiers = extractor.extract_isil_codes(text)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_value == "NL-ASDRM"

    def test_isil_from_nationaal_archief_sample(self, extractor):
        """The Nationaal Archief sample sentence yields exactly NL-HANA."""
        text = "The Nationaal Archief (National Archive) is located in The Hague. ISIL: NL-HaNA."
        identifiers = extractor.extract_isil_codes(text)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_value == "NL-HANA"


class TestWikidataExtraction:
    """Test Wikidata ID extraction."""

    def test_extract_single_wikidata_id(self, extractor):
        """A single Q-number is extracted with scheme, value and URL set."""
        text = "The Rijksmuseum is Q190804 on Wikidata."
        identifiers = extractor.extract_wikidata_ids(text)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_scheme == "Wikidata"
        assert identifiers[0].identifier_value == "Q190804"
        assert "wikidata.org" in str(identifiers[0].identifier_url)

    def test_extract_multiple_wikidata_ids(self, extractor):
        """All three Q-numbers in the text are returned."""
        text = "Museums: Q190804 (Rijksmuseum), Q1526131 (Van Gogh), Q679527 (Stedelijk)"
        identifiers = extractor.extract_wikidata_ids(text)

        assert len(identifiers) == 3
        values = [ident.identifier_value for ident in identifiers]
        assert "Q190804" in values
        assert "Q1526131" in values
        assert "Q679527" in values

    def test_wikidata_url_construction(self, extractor):
        """The identifier URL is the canonical wikidata.org wiki page."""
        text = "Q190804"
        identifiers = extractor.extract_wikidata_ids(text)

        assert len(identifiers) == 1
        assert str(identifiers[0].identifier_url) == "https://www.wikidata.org/wiki/Q190804"


class TestVIAFExtraction:
    """Test VIAF ID extraction."""

    def test_extract_viaf_from_url(self, extractor):
        """A VIAF ID embedded in a full https URL is extracted."""
        text = "See https://viaf.org/viaf/123456789 for authority data."
        identifiers = extractor.extract_viaf_ids(text)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_scheme == "VIAF"
        assert identifiers[0].identifier_value == "123456789"
        assert "viaf.org" in str(identifiers[0].identifier_url)

    def test_extract_multiple_viaf_ids(self, extractor):
        """Two scheme-less viaf.org references both yield an ID."""
        text = "VIAF IDs: viaf.org/viaf/111111111 and viaf.org/viaf/222222222"
        identifiers = extractor.extract_viaf_ids(text)

        assert len(identifiers) == 2
        values = [ident.identifier_value for ident in identifiers]
        assert "111111111" in values
        assert "222222222" in values

    def test_viaf_case_insensitive(self, extractor):
        """An upper-case VIAF.ORG/VIAF/ prefix is still recognised."""
        text = "See VIAF.ORG/VIAF/123456789"
        identifiers = extractor.extract_viaf_ids(text)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_value == "123456789"


class TestKvKExtraction:
    """Test Dutch KvK number extraction."""

    def test_extract_kvk_with_context(self, extractor):
        """An 8-digit number preceded by "KvK number:" is extracted."""
        text = "KvK number: 12345678"
        identifiers = extractor.extract_kvk_numbers(text, context_required=True)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_scheme == "KvK"
        assert identifiers[0].identifier_value == "12345678"

    def test_extract_kvk_full_name(self, extractor):
        """The spelled-out "Kamer van Koophandel" also counts as context."""
        text = "Kamer van Koophandel nummer: 87654321"
        identifiers = extractor.extract_kvk_numbers(text, context_required=True)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_value == "87654321"

    def test_no_kvk_without_context(self, extractor):
        """With context_required=True a bare 8-digit number is ignored."""
        # Random 8-digit number without KvK context should be ignored
        text = "The building number is 12345678"
        identifiers = extractor.extract_kvk_numbers(text, context_required=True)

        assert len(identifiers) == 0

    def test_kvk_without_context_requirement(self, extractor):
        """With context_required=False a bare 8-digit number is accepted."""
        text = "The number is 12345678"
        identifiers = extractor.extract_kvk_numbers(text, context_required=False)

        # Should extract even without context when context_required=False
        assert len(identifiers) >= 1

    def test_reject_year_as_kvk(self, extractor):
        """A 4-digit year next to a KvK number is not reported as KvK."""
        # Should not extract years as KvK numbers
        text = "KvK: 12345678. Founded in 1995."
        identifiers = extractor.extract_kvk_numbers(text, context_required=False)

        # Should extract 12345678 but not 1995
        values = [ident.identifier_value for ident in identifiers]
        assert "12345678" in values
        assert "1995" not in values


class TestURLExtraction:
    """Test URL extraction."""

    def test_extract_http_url(self, extractor):
        """A plain http:// URL is found."""
        text = "Visit http://www.rijksmuseum.nl for more info."
        urls = extractor.extract_urls(text)

        assert len(urls) == 1
        assert "rijksmuseum.nl" in urls[0]

    def test_extract_https_url(self, extractor):
        """An https:// URL is found."""
        text = "Website: https://www.nationaalarchief.nl"
        urls = extractor.extract_urls(text)

        assert len(urls) == 1
        assert "nationaalarchief.nl" in urls[0]

    def test_extract_multiple_urls(self, extractor):
        """Three comma-separated URLs are returned as three results."""
        text = "Sites: https://rijksmuseum.nl, https://vangoghmuseum.nl, https://stedelijk.nl"
        urls = extractor.extract_urls(text)

        assert len(urls) == 3

    def test_filter_urls_by_domain(self, extractor):
        """filter_domains=["museum"] keeps only the rijksmuseum.nl URL."""
        text = "Museums: https://rijksmuseum.nl, Libraries: https://kb.nl, Archives: https://gahetna.nl"
        museum_urls = extractor.extract_urls(text, filter_domains=["museum"])

        assert len(museum_urls) == 1
        assert "rijksmuseum.nl" in museum_urls[0]

    def test_url_with_path(self, extractor):
        """The path component of a URL is kept in the result."""
        text = "Collection: https://www.rijksmuseum.nl/en/rijksstudio"
        urls = extractor.extract_urls(text)

        assert len(urls) == 1
        assert "rijksstudio" in urls[0]


class TestExtractAll:
    """Test extracting all identifier types at once."""

    def test_extract_all_from_rijksmuseum_text(self, extractor):
        """The Rijksmuseum paragraph yields one ISIL code plus a URL."""
        text = """
        The Rijksmuseum is a Dutch national museum in Amsterdam.
        The museum has an ISIL code NL-AsdRM.
        It is located at Museumstraat 1, 1071 XX Amsterdam.
        The museum holds over 1 million objects in its collection.
        Their digital collection is available at https://www.rijksmuseum.nl/en/rijksstudio.
        """
        identifiers = extractor.extract_all(text, include_urls=True)

        # Should find ISIL code and URL
        schemes = {ident.identifier_scheme for ident in identifiers}
        assert "ISIL" in schemes
        assert "URL" in schemes

        # Check ISIL value
        isil_ids = [ident for ident in identifiers if ident.identifier_scheme == "ISIL"]
        assert len(isil_ids) == 1
        assert isil_ids[0].identifier_value == "NL-ASDRM"

    def test_extract_all_from_nationaal_archief_text(self, extractor):
        """The Nationaal Archief paragraph yields ISIL and URL schemes."""
        text = """
        The Nationaal Archief (National Archive) is located in The Hague, Netherlands.
        ISIL: NL-HaNA.
        Address: Prins Willem-Alexanderhof 20, 2595 BE Den Haag.
        Their catalog is searchable at https://www.nationaalarchief.nl.
        """
        identifiers = extractor.extract_all(text, include_urls=True)

        schemes = {ident.identifier_scheme for ident in identifiers}
        assert "ISIL" in schemes
        assert "URL" in schemes

    def test_deduplication(self, extractor):
        """An ISIL code mentioned twice is reported only once."""
        # Same ISIL code mentioned twice
        text = "ISIL code NL-AsdRM. The code NL-AsdRM identifies the museum."
        identifiers = extractor.extract_all(text)

        # Should only return one instance
        isil_ids = [ident for ident in identifiers if ident.identifier_scheme == "ISIL"]
        assert len(isil_ids) == 1

    def test_mixed_identifiers(self, extractor):
        """Text with all five identifier kinds reports all five schemes."""
        text = """
        Rijksmuseum (Q190804) has ISIL code NL-AsdRM.
        VIAF: viaf.org/viaf/123456789.
        KvK: 12345678.
        Website: https://rijksmuseum.nl
        """
        identifiers = extractor.extract_all(text, include_urls=True)

        schemes = {ident.identifier_scheme for ident in identifiers}
        assert "ISIL" in schemes
        assert "Wikidata" in schemes
        assert "VIAF" in schemes
        assert "KvK" in schemes
        assert "URL" in schemes


class TestExtractWithContext:
    """Test extraction with surrounding context."""

    def test_extract_with_context_basic(self, extractor):
        """The ISIL result carries the words just before and after it."""
        text = "The Rijksmuseum has ISIL code NL-AsdRM and is located in Amsterdam."
        results = extractor.extract_with_context(text, context_window=20)

        # Should find NL-AsdRM
        assert len(results) > 0

        # Find the ISIL code result
        isil_result = next(
            (r for r in results if r["identifier"].identifier_scheme == "ISIL"),
            None,
        )
        assert isil_result is not None

        # Check context includes nearby words
        assert "ISIL code" in isil_result["context_before"] or "code" in isil_result["context_before"]
        assert "and is" in isil_result["context_after"] or "and" in isil_result["context_after"]

    def test_context_window_size(self, extractor):
        """context_window=5 bounds both context strings to at most 6 chars."""
        text = "Museum ISIL: NL-AsdRM details"
        results = extractor.extract_with_context(text, context_window=5)

        isil_result = next(
            (r for r in results if r["identifier"].identifier_scheme == "ISIL"),
            None,
        )
        assert isil_result is not None

        # With window=5, should get ~5 chars before/after
        assert len(isil_result["context_before"]) <= 6  # "SIL: " = 5 chars
        assert len(isil_result["context_after"]) <= 6

    def test_position_information(self, extractor):
        """The result's "position" is the code's offset in the input text."""
        text = "ISIL: NL-AsdRM"
        results = extractor.extract_with_context(text, context_window=10)

        isil_result = next(
            (r for r in results if r["identifier"].identifier_scheme == "ISIL"),
            None,
        )
        assert isil_result is not None
        assert "position" in isil_result
        assert isil_result["position"] == text.index("NL-AsdRM")


class TestEdgeCases:
    """Test edge cases and boundary conditions."""

    def test_empty_string(self, extractor):
        """An empty input produces no identifiers."""
        identifiers = extractor.extract_all("")
        assert len(identifiers) == 0

    def test_no_identifiers(self, extractor):
        """Plain prose without any identifier produces an empty result."""
        text = "This is just plain text with no identifiers."
        identifiers = extractor.extract_all(text)
        assert len(identifiers) == 0

    def test_special_characters(self, extractor):
        """Punctuation directly after the code does not end up in the value."""
        text = "ISIL: NL-AsdRM; other info"
        identifiers = extractor.extract_isil_codes(text)

        assert len(identifiers) == 1
        assert identifiers[0].identifier_value == "NL-ASDRM"

    def test_multiline_text(self, extractor):
        """Identifiers spread over several lines are all found."""
        # The identifiers must sit on separate physical lines; otherwise this
        # test would not exercise multi-line input at all.
        text = """
        Line 1: ISIL NL-AsdRM
        Line 2: Wikidata Q190804
        Line 3: Website https://rijksmuseum.nl
        """
        identifiers = extractor.extract_all(text, include_urls=True)

        schemes = {ident.identifier_scheme for ident in identifiers}
        assert "ISIL" in schemes
        assert "Wikidata" in schemes
        assert "URL" in schemes

    def test_unicode_text(self, extractor):
        """Dutch-language text still yields at least an ISIL code and a URL."""
        # NOTE(review): this sample contains only ASCII characters; consider
        # adding a non-ASCII character if real Unicode coverage is intended.
        text = "Het Rijksmuseum heeft ISIL-code NL-AsdRM. Bezoek https://rijksmuseum.nl"
        identifiers = extractor.extract_all(text, include_urls=True)

        assert len(identifiers) >= 2  # ISIL and URL

    def test_partial_matches_not_extracted(self, extractor):
        """The X-prefixed token is never reported as the value XNL-ASDRM."""
        # Should not extract partial ISIL codes
        text = "Code: XNL-AsdRM is invalid"  # Extra X prefix
        identifiers = extractor.extract_isil_codes(text)

        # Should not match because of the X prefix
        # NOTE(review): this only rules out the value "XNL-ASDRM"; it does not
        # check whether the embedded "NL-ASDRM" is extracted - confirm intent.
        assert not any(ident.identifier_value == "XNL-ASDRM" for ident in identifiers)