- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
492 lines
20 KiB
Python
492 lines
20 KiB
Python
"""
Tests for Bibliographic Module (schemas/bibliographic.yaml)

Tests the bibliographic schema including:
- Publication entities (journal articles, books, conference papers)
- Citation relationships
- Document sections
- Author/contributor metadata
- Open access status tracking
- Integration with HeritageCustodian.publications

Test Coverage Goals:
1. Validate publication instance files in data/instances/publications/
2. Test publication type variations (journal articles, books, chapters, datasets)
3. Validate citation relationships
4. Test integration with heritage institutions
5. Validate provenance metadata
"""
|
|
|
|
from collections import Counter
from pathlib import Path
from typing import List, Dict, Any

import pytest
import yaml
|
|
|
|
# Locations of the YAML inputs, both derived from this file's own path:
# instance data sits beside the tests' parent directory, fixtures sit
# next to this file.
PUBLICATIONS_DIR = Path(__file__).parent.parent.joinpath("data", "instances", "publications")
FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures", "publications")
|
|
|
|
|
|
class TestPublicationInstanceFiles:
    """Sanity checks on the YAML files found directly in PUBLICATIONS_DIR."""

    @pytest.fixture
    def publication_files(self) -> List[Path]:
        """Return every ``*.yaml`` path directly inside PUBLICATIONS_DIR."""
        return [*PUBLICATIONS_DIR.glob("*.yaml")]

    def test_publications_directory_exists(self):
        """The publications directory must be present."""
        assert PUBLICATIONS_DIR.exists(), f"Publications directory not found: {PUBLICATIONS_DIR}"

    def test_publication_files_found(self, publication_files):
        """The directory must contain at least one YAML file."""
        assert len(publication_files) > 0, "No publication files found"

    def test_all_publication_files_valid_yaml(self, publication_files):
        """Each file must parse as YAML and must not load as ``None``."""
        for path in publication_files:
            try:
                with path.open('r', encoding='utf-8') as handle:
                    parsed = yaml.safe_load(handle)
                assert parsed is not None, f"{path.name} is empty"
            except yaml.YAMLError as exc:
                # A parse error is reported as a test failure naming the file.
                pytest.fail(f"{path.name} has invalid YAML syntax: {exc}")

    def test_publication_files_are_lists(self, publication_files):
        """Each file's top-level YAML value must be a non-empty list."""
        for path in publication_files:
            with path.open('r', encoding='utf-8') as handle:
                parsed = yaml.safe_load(handle)
            assert isinstance(parsed, list), f"{path.name} should contain a list"
            assert len(parsed) > 0, f"{path.name} list is empty"
|
|
|
|
|
|
class TestPublicationSchema:
    """Check that publication records carry the fields and enum values the tests expect.

    The three fixtures each load one YAML file from ``PUBLICATIONS_DIR``. Files
    are decoded explicitly as UTF-8 so results do not depend on the platform's
    default encoding.
    """

    @staticmethod
    def _load_publications(filename: str) -> List[Dict[str, Any]]:
        """Parse ``PUBLICATIONS_DIR / filename`` with ``yaml.safe_load`` and return the result."""
        with open(PUBLICATIONS_DIR / filename, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    @pytest.fixture
    def heritage_linked_pubs(self) -> List[Dict[str, Any]]:
        """Load heritage-linked publications."""
        return self._load_publications("heritage_linked_publications.yaml")

    @pytest.fixture
    def semantic_web_papers(self) -> List[Dict[str, Any]]:
        """Load semantic web papers."""
        return self._load_publications("semantic_web_papers.yaml")

    @pytest.fixture
    def diverse_heritage_pubs(self) -> List[Dict[str, Any]]:
        """Load diverse heritage publications (books, chapters, reports)."""
        return self._load_publications("diverse_heritage_publications.yaml")

    def test_required_fields_present(self, heritage_linked_pubs):
        """Verify required fields are present and non-empty in all publications."""
        required_fields = ('publication_id', 'title', 'publication_type')

        for pub in heritage_linked_pubs:
            for field in required_fields:
                assert field in pub, f"Missing required field '{field}' in publication: {pub.get('title', 'Unknown')}"
                assert pub[field], f"Field '{field}' is empty in publication: {pub.get('title', 'Unknown')}"

    def test_publication_types_valid(self, heritage_linked_pubs, semantic_web_papers, diverse_heritage_pubs):
        """Verify every publication_type is one of the values listed below."""
        # NOTE(review): the conference-paper fixture test in this file expects
        # 'REVIEW_ARTICLE' while this list has 'REVIEW' — confirm against the schema enum.
        valid_types = (
            'JOURNAL_ARTICLE',
            'BOOK',
            'BOOK_CHAPTER',
            'CONFERENCE_PAPER',
            'DATASET',
            'PREPRINT',
            'TECHNICAL_REPORT',
            'THESIS',
            'WORKING_PAPER',
            'REVIEW',
        )

        all_pubs = heritage_linked_pubs + semantic_web_papers + diverse_heritage_pubs

        for pub in all_pubs:
            pub_type = pub.get('publication_type')
            assert pub_type in valid_types, (
                f"Invalid publication_type '{pub_type}' in publication: {pub.get('title', 'Unknown')}"
            )

    def test_authors_structure(self, heritage_linked_pubs):
        """Verify author metadata structure for publications that list authors."""
        for pub in heritage_linked_pubs:
            if 'authors' not in pub:
                continue

            assert isinstance(pub['authors'], list), f"Authors should be a list in: {pub['title']}"

            for author in pub['authors']:
                # Each author should have person_name
                assert 'person_name' in author, f"Author missing person_name in: {pub['title']}"

                # If ORCID present, it must be non-empty and consist of four
                # hyphen-separated parts (e.g. 0000-0002-1825-0097). Only the
                # part count is checked here, not the digits or checksum.
                if 'orcid' in author:
                    orcid = author['orcid']
                    assert orcid, f"ORCID is empty for {author['person_name']}"
                    parts = orcid.split('-')
                    assert len(parts) == 4, f"Invalid ORCID format: {orcid}"

                # If affiliation present, check structure
                if 'affiliation' in author:
                    affiliation = author['affiliation']
                    assert 'organization_name' in affiliation, (
                        f"Affiliation missing organization_name for {author['person_name']}"
                    )

    def test_doi_format(self, heritage_linked_pubs, semantic_web_papers):
        """Verify that every non-empty DOI starts with the ``10.`` prefix."""
        all_pubs = heritage_linked_pubs + semantic_web_papers

        for pub in all_pubs:
            doi = pub.get('doi')
            if doi:
                # DOI format: 10.XXXX/...
                assert doi.startswith('10.'), f"Invalid DOI format: {doi} in {pub['title']}"

    def test_open_access_status_valid(self, heritage_linked_pubs, diverse_heritage_pubs):
        """Verify open access status values, when present, are from the list below."""
        valid_statuses = (
            'FULLY_OPEN_ACCESS',
            'OPEN_ACCESS_REPOSITORY',
            'HYBRID_OPEN_ACCESS',
            'DELAYED_OPEN_ACCESS',
            'CLOSED_ACCESS',
            'UNKNOWN',
        )

        all_pubs = heritage_linked_pubs + diverse_heritage_pubs

        for pub in all_pubs:
            if 'open_access_status' in pub:
                status = pub['open_access_status']
                assert status in valid_statuses, (
                    f"Invalid open_access_status '{status}' in: {pub['title']}"
                )

    def test_provenance_present(self, heritage_linked_pubs):
        """Verify provenance metadata with data_source and extraction_date is present."""
        for pub in heritage_linked_pubs:
            assert 'provenance' in pub, f"Missing provenance in publication: {pub['title']}"

            prov = pub['provenance']
            assert 'data_source' in prov, f"Missing provenance.data_source in: {pub['title']}"
            assert 'extraction_date' in prov, f"Missing provenance.extraction_date in: {pub['title']}"
|
|
|
|
|
|
class TestCitationRelationships:
    """Test citation metadata and relationships loaded from citation_relationships.yaml."""

    @pytest.fixture
    def citation_data(self) -> List[Dict[str, Any]]:
        """Load citation relationships, decoding the file explicitly as UTF-8."""
        with open(PUBLICATIONS_DIR / "citation_relationships.yaml", 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    def test_citation_relationships_file_exists(self):
        """Verify citation relationships file exists."""
        citation_file = PUBLICATIONS_DIR / "citation_relationships.yaml"
        assert citation_file.exists(), f"Citation relationships file not found: {citation_file}"

    def test_citation_structure(self, citation_data):
        """Verify each citation entry carries its three required keys.

        ``citation_type`` is optional and deliberately not validated here.
        Sample CiTO-style values are CITES, CITES_AS_AUTHORITY,
        CITES_AS_DATA_SOURCE, EXTENDS, USES_METHOD_IN, DISAGREES_WITH,
        SUPPORTS and REFUTES; the actual enum may have more types.
        """
        for item in citation_data:
            if 'citations' not in item:
                continue

            citations = item['citations']
            assert isinstance(citations, list), "Citations should be a list"

            for citation in citations:
                # Required fields
                assert 'citation_id' in citation, "Citation missing citation_id"
                assert 'citing_work' in citation, "Citation missing citing_work"
                assert 'cited_work' in citation, "Citation missing cited_work"

    def test_citation_ids_unique(self, citation_data):
        """Verify citation IDs are unique across all items.

        Entries with a missing or falsy ``citation_id`` are ignored.
        """
        # Count every ID in one pass instead of calling list.count() per
        # element, which is quadratic in the number of citations.
        id_counts = Counter(
            citation.get('citation_id')
            for item in citation_data
            if 'citations' in item
            for citation in item['citations']
            if citation.get('citation_id')
        )

        duplicates = {cid for cid, seen in id_counts.items() if seen > 1}
        assert len(duplicates) == 0, f"Duplicate citation IDs found: {duplicates}"
|
|
|
|
|
|
class TestPublicationFixtures:
    """Test publication fixture files used for testing.

    The journal-article fixture is mandatory. The conference-paper and
    book-chapter fixtures are optional: when one is absent its test is
    reported as skipped instead of passing without checking anything.
    """

    @staticmethod
    def _load_fixture_list(fixture_file: Path) -> List[Dict[str, Any]]:
        """Load a fixture as UTF-8 YAML and assert it is a non-empty list."""
        with open(fixture_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        assert isinstance(data, list), "Fixture should be a list"
        # Guard the data[0] access performed by every caller.
        assert len(data) > 0, "Fixture is empty"
        return data

    def test_journal_article_fixture(self):
        """Verify journal article fixture is valid."""
        fixture_file = FIXTURES_DIR / "journal_article_example.yaml"
        assert fixture_file.exists(), f"Journal article fixture not found: {fixture_file}"

        pub = self._load_fixture_list(fixture_file)[0]
        assert pub['publication_type'] == 'JOURNAL_ARTICLE'
        assert 'title' in pub
        assert 'authors' in pub

    def test_conference_paper_fixture(self):
        """Verify conference paper fixture is valid (skipped when the file is absent)."""
        fixture_file = FIXTURES_DIR / "conference_paper_example.yaml"
        if not fixture_file.exists():
            pytest.skip(f"Optional fixture not present: {fixture_file}")

        pub = self._load_fixture_list(fixture_file)[0]
        # Note: This fixture contains a REVIEW_ARTICLE (ACM Computing Surveys paper)
        # originally presented as ISWC 2020 keynote
        assert pub['publication_type'] == 'REVIEW_ARTICLE'

    def test_book_chapter_fixture(self):
        """Verify book chapter fixture is valid (skipped when the file is absent)."""
        fixture_file = FIXTURES_DIR / "book_chapter_example.yaml"
        if not fixture_file.exists():
            pytest.skip(f"Optional fixture not present: {fixture_file}")

        pub = self._load_fixture_list(fixture_file)[0]
        assert pub['publication_type'] == 'BOOK_CHAPTER'
|
|
|
|
|
|
class TestInstitutionPublicationIntegration:
    """Test integration between HeritageCustodian and publications.

    Both tests read the first entry of ``institution_with_publications.yaml``
    in FIXTURES_DIR.
    """

    @staticmethod
    def _load_first_institution() -> Dict[str, Any]:
        """Return the first record of the institution fixture.

        Fails with an assertion message, rather than an unhandled
        FileNotFoundError or IndexError, when the fixture is missing, is not
        a list, or is empty.
        """
        fixture_file = FIXTURES_DIR / "institution_with_publications.yaml"
        assert fixture_file.exists(), f"Institution with publications fixture not found: {fixture_file}"

        with open(fixture_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        assert isinstance(data, list), "Fixture should be a list"
        assert len(data) > 0, "Fixture is empty"
        return data[0]

    def test_institution_with_publications_fixture(self):
        """Verify institution with publications fixture is valid."""
        institution = self._load_first_institution()

        # Check it's a heritage institution
        assert 'institution_type' in institution

        # Check publications field
        assert 'publications' in institution, "Institution should have publications field"
        publications = institution['publications']
        assert isinstance(publications, list), "Publications should be a list"
        assert len(publications) > 0, "Institution should have at least one publication"

        # Verify publication structure
        for pub in publications:
            assert 'publication_id' in pub
            assert 'title' in pub
            assert 'publication_type' in pub

    def test_nisv_publications(self):
        """Test Netherlands Institute for Sound and Vision publications."""
        nisv = self._load_first_institution()
        assert nisv['name'] == "Netherlands Institute for Sound and Vision"

        # NISV should have publications
        pubs = nisv['publications']
        assert len(pubs) >= 3, "NISV should have at least 3 publications"

        # Check publication types diversity
        pub_types = [p['publication_type'] for p in pubs]
        assert 'JOURNAL_ARTICLE' in pub_types, "Should have at least one journal article"
        assert 'CONFERENCE_PAPER' in pub_types or 'DATASET' in pub_types, "Should have diverse publication types"
|
|
|
|
|
|
class TestDocumentSections:
    """Test document section metadata on the journal-article fixture."""

    @pytest.fixture
    def publications_with_sections(self) -> List[Dict[str, Any]]:
        """Load the journal-article fixture, decoding it explicitly as UTF-8.

        Publications without a ``document_sections`` key are skipped by the
        tests below.
        """
        fixture_file = FIXTURES_DIR / "journal_article_example.yaml"
        with open(fixture_file, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    def test_document_sections_structure(self, publications_with_sections):
        """Verify every document section has section_id and section_type.

        The value of ``section_type`` is informational and not checked against
        a strict enum. Typical values are ABSTRACT, INTRODUCTION, METHODS,
        RESULTS, DISCUSSION, CONCLUSION, BIBLIOGRAPHY and ACKNOWLEDGMENTS.
        """
        for pub in publications_with_sections:
            if 'document_sections' not in pub:
                continue

            sections = pub['document_sections']
            assert isinstance(sections, list), "Document sections should be a list"

            for section in sections:
                # Check required fields
                assert 'section_id' in section, "Section missing section_id"
                assert 'section_type' in section, "Section missing section_type"

    def test_section_order(self, publications_with_sections):
        """Verify section_order values are unique and not below 1.

        Gaps in the numbering are allowed. Sections with no ``section_order``
        or a null one are ignored.
        """
        for pub in publications_with_sections:
            if 'document_sections' not in pub:
                continue

            # Drop null orders so min() never compares None with an int.
            orders = [
                s['section_order']
                for s in pub['document_sections']
                if s.get('section_order') is not None
            ]

            if orders:
                assert min(orders) >= 1, "Section order should start at 1"
                # Allow gaps but ensure uniqueness
                assert len(orders) == len(set(orders)), "Section orders should be unique"
|
|
|
|
|
|
class TestPublicationCounts:
    """Test publication counts and coverage across every YAML file in PUBLICATIONS_DIR."""

    @staticmethod
    def _iter_publications():
        """Yield each record from every ``*.yaml`` file whose top-level value is a list.

        Files are decoded explicitly as UTF-8. Files whose top-level value is
        not a list are skipped.
        """
        for pub_file in PUBLICATIONS_DIR.glob("*.yaml"):
            with open(pub_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if isinstance(data, list):
                yield from data

    def test_total_publication_count(self):
        """Count total publications in dataset and require at least one."""
        total_count = sum(1 for _ in self._iter_publications())

        # Should have publications (exact count may vary)
        assert total_count > 0, "No publications found in dataset"
        print(f"\n✅ Total publications in dataset: {total_count}")

    def test_publication_type_distribution(self):
        """Print publication types by descending count and require at least one type.

        Records with no ``publication_type`` are counted under 'UNKNOWN'.
        """
        type_counts = Counter(
            pub.get('publication_type', 'UNKNOWN') for pub in self._iter_publications()
        )

        print("\n✅ Publication type distribution:")
        # most_common() orders by descending count.
        for pub_type, count in type_counts.most_common():
            print(f" {pub_type}: {count}")

        assert len(type_counts) > 0, "No publication types found"
|
|
|
|
|
|
class TestPublicationQuality:
    """Test data quality metrics for publications.

    Only ``test_doi_coverage`` asserts anything. The ORCID and open-access
    tests print a report and make no assertions.
    """

    @staticmethod
    def _iter_publications():
        """Yield each record from every ``*.yaml`` file whose top-level value is a list.

        Files are decoded explicitly as UTF-8. Files whose top-level value is
        not a list are skipped.
        """
        for pub_file in PUBLICATIONS_DIR.glob("*.yaml"):
            with open(pub_file, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            if isinstance(data, list):
                yield from data

    def test_doi_coverage(self):
        """Print the share of publications with a non-empty DOI and require at least one."""
        total = 0
        with_doi = 0

        for pub in self._iter_publications():
            total += 1
            if pub.get('doi'):
                with_doi += 1

        # Avoid dividing by zero when no publications were loaded.
        coverage = (with_doi / total * 100) if total > 0 else 0
        print(f"\n✅ DOI coverage: {with_doi}/{total} ({coverage:.1f}%)")

        # At least some publications should have DOIs
        assert with_doi > 0, "No publications with DOIs found"

    def test_orcid_coverage(self):
        """Print the share of authors with a non-empty ORCID (report only)."""
        total_authors = 0
        with_orcid = 0

        for pub in self._iter_publications():
            if 'authors' not in pub:
                continue
            for author in pub['authors']:
                total_authors += 1
                if author.get('orcid'):
                    with_orcid += 1

        # Avoid dividing by zero when no authors were found.
        coverage = (with_orcid / total_authors * 100) if total_authors > 0 else 0
        print(f"\n✅ ORCID coverage: {with_orcid}/{total_authors} ({coverage:.1f}%)")

    def test_open_access_coverage(self):
        """Print open access statuses by descending count (report only).

        Records with no ``open_access_status`` are counted under 'UNKNOWN'.
        """
        oa_counts = Counter(
            pub.get('open_access_status', 'UNKNOWN') for pub in self._iter_publications()
        )

        print("\n✅ Open access distribution:")
        # most_common() orders by descending count.
        for status, count in oa_counts.most_common():
            print(f" {status}: {count}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests with pytest and exit with pytest's return code,
    # so that running the file as a script reports failures to the caller
    # instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))
|