glam/scripts/add_isil_gap_notes.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

235 lines
9.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Add ISIL gap documentation to Latin American institution provenance records.
This script:
1. Reads the Wikidata-enriched Latin American institutions dataset
2. Adds standardized provenance notes explaining ISIL unavailability
3. Documents Wikidata enrichment results (0 ISIL codes found among 2,409 institutions)
4. References the enrichment strategy document for details
5. Writes updated YAML with comprehensive gap documentation
Context:
- Brazil, Mexico, and Chile lack publicly accessible ISIL registries
- Wikidata enrichment (2025-11-06) confirmed ISIL unavailability
- 0 ISIL codes found among 2,409 Wikidata GLAM institutions in these countries
Usage:
python scripts/add_isil_gap_notes.py
Input:
data/instances/latin_american_institutions_enriched.yaml (304 institutions)
Output:
data/instances/latin_american_institutions_documented.yaml
"""
import sys
from pathlib import Path
from datetime import datetime, timezone
import yaml
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
class ISILGapDocumenter:
"""Add ISIL gap documentation to institution provenance records."""
# Country-specific ISIL gap notes
GAP_NOTES = {
'BR': (
"ISIL code unavailable: No public ISIL registry exists for Brazil. "
"Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
"Brazilian GLAM institutions. National registration agency (Biblioteca "
"Nacional do Brasil or IBICT) has not published public ISIL directory. "
"Recommended action: Direct outreach to Biblioteca Nacional do Brasil. "
"See docs/isil_enrichment_strategy.md for details."
),
'MX': (
"ISIL code unavailable: No public ISIL registry exists for Mexico. "
"Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
"Mexican GLAM institutions. National registration agency (Biblioteca "
"Nacional de México under UNAM) has not published public ISIL directory. "
"Recommended action: Direct outreach to Biblioteca Nacional de México. "
"See docs/isil_enrichment_strategy.md for details."
),
'CL': (
"ISIL code unavailable: No public ISIL registry exists for Chile. "
"Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
"Chilean GLAM institutions. National registration agency (Biblioteca "
"Nacional de Chile or Servicio Nacional del Patrimonio Cultural) has not "
"published public ISIL directory. Recommended action: Direct outreach to "
"Biblioteca Nacional de Chile. See docs/isil_enrichment_strategy.md for details."
)
}
def __init__(self, input_file: Path, output_file: Path):
self.input_file = input_file
self.output_file = output_file
self.stats = {
'total_institutions': 0,
'notes_added': 0,
'notes_updated': 0,
'by_country': {'BR': 0, 'MX': 0, 'CL': 0}
}
def run(self) -> None:
"""Main execution: read, process, write, report."""
print(f"Reading institutions from: {self.input_file}")
institutions = self._load_institutions()
print("Adding ISIL gap documentation to provenance records...")
updated_institutions = self._add_gap_notes(institutions)
print(f"Writing documented institutions to: {self.output_file}")
self._save_institutions(updated_institutions)
self._print_report()
def _load_institutions(self) -> list[dict]:
"""Load institutions from YAML file."""
with open(self.input_file, 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f)
if not isinstance(institutions, list):
raise ValueError(f"Expected list of institutions, got {type(institutions)}")
self.stats['total_institutions'] = len(institutions)
return institutions
def _add_gap_notes(self, institutions: list[dict]) -> list[dict]:
"""Add ISIL gap notes to each institution's provenance."""
updated = []
for inst in institutions:
country = self._get_country_code(inst)
if country and country in self.GAP_NOTES:
inst = self._update_provenance_notes(inst, country)
self.stats['by_country'][country] += 1
updated.append(inst)
return updated
def _get_country_code(self, institution: dict) -> str | None:
"""Extract country code from institution locations."""
locations = institution.get('locations', [])
if not locations:
return None
# Get country from first location
return locations[0].get('country')
def _update_provenance_notes(self, institution: dict, country: str) -> dict:
"""Add or update provenance notes with ISIL gap information."""
# Ensure provenance exists
if 'provenance' not in institution:
institution['provenance'] = {}
provenance = institution['provenance']
existing_notes = provenance.get('notes', '').strip()
gap_note = self.GAP_NOTES[country]
# Check if ISIL gap note already present
if 'ISIL code unavailable' in existing_notes:
# Update existing note
# Replace old note with new standardized version
lines = existing_notes.split('\n')
filtered = [line for line in lines if 'ISIL code unavailable' not in line]
filtered.append(gap_note)
provenance['notes'] = '\n'.join(filtered).strip()
self.stats['notes_updated'] += 1
else:
# Add new note
if existing_notes:
provenance['notes'] = f"{existing_notes}\n{gap_note}"
else:
provenance['notes'] = gap_note
self.stats['notes_added'] += 1
return institution
def _save_institutions(self, institutions: list[dict]) -> None:
"""Save updated institutions to YAML file."""
# Add metadata comment at top
timestamp = datetime.now(timezone.utc).isoformat()
with open(self.output_file, 'w', encoding='utf-8') as f:
# Write header comment
f.write("---\n")
f.write("# Latin American GLAM Institutions - ISIL Gap Documented\n")
f.write(f"# Generated: {timestamp}\n")
f.write("#\n")
f.write("# ISIL Gap Documentation Summary:\n")
f.write(f"# - Total institutions: {self.stats['total_institutions']}\n")
f.write(f"# - New notes added: {self.stats['notes_added']}\n")
f.write(f"# - Notes updated: {self.stats['notes_updated']}\n")
f.write(f"# - Brazil (BR): {self.stats['by_country']['BR']} institutions\n")
f.write(f"# - Mexico (MX): {self.stats['by_country']['MX']} institutions\n")
f.write(f"# - Chile (CL): {self.stats['by_country']['CL']} institutions\n")
f.write("#\n")
f.write("# ISIL Research Findings:\n")
f.write("# - No public ISIL registries found for BR, MX, CL\n")
f.write("# - Wikidata enrichment (2025-11-06): 0 ISIL codes among 2,409 institutions\n")
f.write("# - Recommended: Direct outreach to national libraries\n")
f.write("# - Reference: docs/isil_enrichment_strategy.md\n")
f.write("\n")
# Write institutions as YAML
yaml.dump(
institutions,
f,
default_flow_style=False,
allow_unicode=True,
sort_keys=False,
width=100,
indent=2
)
def _print_report(self) -> None:
"""Print summary report of gap documentation."""
print("\n" + "=" * 70)
print("ISIL GAP DOCUMENTATION COMPLETE")
print("=" * 70)
print(f"Total institutions: {self.stats['total_institutions']}")
print(f"New notes added: {self.stats['notes_added']}")
print(f"Notes updated: {self.stats['notes_updated']}")
print()
print("Documentation by country:")
print(f" Brazil (BR): {self.stats['by_country']['BR']} institutions")
print(f" Mexico (MX): {self.stats['by_country']['MX']} institutions")
print(f" Chile (CL): {self.stats['by_country']['CL']} institutions")
print()
print("Gap Notes Added:")
print("-" * 70)
for country, note in self.GAP_NOTES.items():
print(f"\n[{country}]")
print(note)
print()
print("=" * 70)
print(f"Output written to: {self.output_file}")
print("=" * 70)
def main():
    """Script entry point.

    Resolves the enriched input file and the documented output file under
    ``<project root>/data/instances`` (the project root being the parent of
    this script's directory), then runs the documenter. If the input file is
    missing, an error is printed and the process exits with status 1.
    """
    instances_dir = Path(__file__).parent.parent / "data" / "instances"
    input_file = instances_dir / "latin_american_institutions_enriched.yaml"
    output_file = instances_dir / "latin_american_institutions_documented.yaml"

    if input_file.exists():
        ISILGapDocumenter(input_file, output_file).run()
        return

    # Nothing to process: report the missing input and signal failure.
    print(f"ERROR: Input file not found: {input_file}")
    print("Expected: data/instances/latin_american_institutions_enriched.yaml")
    sys.exit(1)


if __name__ == "__main__":
    main()