- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
235 lines
9.5 KiB
Python
Executable file
235 lines
9.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Add ISIL gap documentation to Latin American institution provenance records.
|
|
|
|
This script:
|
|
1. Reads the Wikidata-enriched Latin American institutions dataset
|
|
2. Adds standardized provenance notes explaining ISIL unavailability
|
|
3. Documents Wikidata enrichment results (0 ISIL codes found among 2,409 institutions)
|
|
4. References the enrichment strategy document for details
|
|
5. Writes updated YAML with comprehensive gap documentation
|
|
|
|
Context:
|
|
- Brazil, Mexico, and Chile lack publicly accessible ISIL registries
|
|
- Wikidata enrichment (2025-11-06) confirmed ISIL unavailability
|
|
- 0 ISIL codes found among 2,409 Wikidata GLAM institutions in these countries
|
|
|
|
Usage:
|
|
python scripts/add_isil_gap_notes.py
|
|
|
|
Input:
|
|
data/instances/latin_american_institutions_enriched.yaml (304 institutions)
|
|
|
|
Output:
|
|
data/instances/latin_american_institutions_documented.yaml
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
import yaml
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
|
|
class ISILGapDocumenter:
|
|
"""Add ISIL gap documentation to institution provenance records."""
|
|
|
|
# Country-specific ISIL gap notes
|
|
GAP_NOTES = {
|
|
'BR': (
|
|
"ISIL code unavailable: No public ISIL registry exists for Brazil. "
|
|
"Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
|
|
"Brazilian GLAM institutions. National registration agency (Biblioteca "
|
|
"Nacional do Brasil or IBICT) has not published public ISIL directory. "
|
|
"Recommended action: Direct outreach to Biblioteca Nacional do Brasil. "
|
|
"See docs/isil_enrichment_strategy.md for details."
|
|
),
|
|
'MX': (
|
|
"ISIL code unavailable: No public ISIL registry exists for Mexico. "
|
|
"Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
|
|
"Mexican GLAM institutions. National registration agency (Biblioteca "
|
|
"Nacional de México under UNAM) has not published public ISIL directory. "
|
|
"Recommended action: Direct outreach to Biblioteca Nacional de México. "
|
|
"See docs/isil_enrichment_strategy.md for details."
|
|
),
|
|
'CL': (
|
|
"ISIL code unavailable: No public ISIL registry exists for Chile. "
|
|
"Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
|
|
"Chilean GLAM institutions. National registration agency (Biblioteca "
|
|
"Nacional de Chile or Servicio Nacional del Patrimonio Cultural) has not "
|
|
"published public ISIL directory. Recommended action: Direct outreach to "
|
|
"Biblioteca Nacional de Chile. See docs/isil_enrichment_strategy.md for details."
|
|
)
|
|
}
|
|
|
|
def __init__(self, input_file: Path, output_file: Path):
|
|
self.input_file = input_file
|
|
self.output_file = output_file
|
|
self.stats = {
|
|
'total_institutions': 0,
|
|
'notes_added': 0,
|
|
'notes_updated': 0,
|
|
'by_country': {'BR': 0, 'MX': 0, 'CL': 0}
|
|
}
|
|
|
|
def run(self) -> None:
|
|
"""Main execution: read, process, write, report."""
|
|
print(f"Reading institutions from: {self.input_file}")
|
|
institutions = self._load_institutions()
|
|
|
|
print("Adding ISIL gap documentation to provenance records...")
|
|
updated_institutions = self._add_gap_notes(institutions)
|
|
|
|
print(f"Writing documented institutions to: {self.output_file}")
|
|
self._save_institutions(updated_institutions)
|
|
|
|
self._print_report()
|
|
|
|
def _load_institutions(self) -> list[dict]:
|
|
"""Load institutions from YAML file."""
|
|
with open(self.input_file, 'r', encoding='utf-8') as f:
|
|
institutions = yaml.safe_load(f)
|
|
|
|
if not isinstance(institutions, list):
|
|
raise ValueError(f"Expected list of institutions, got {type(institutions)}")
|
|
|
|
self.stats['total_institutions'] = len(institutions)
|
|
return institutions
|
|
|
|
def _add_gap_notes(self, institutions: list[dict]) -> list[dict]:
|
|
"""Add ISIL gap notes to each institution's provenance."""
|
|
updated = []
|
|
|
|
for inst in institutions:
|
|
country = self._get_country_code(inst)
|
|
|
|
if country and country in self.GAP_NOTES:
|
|
inst = self._update_provenance_notes(inst, country)
|
|
self.stats['by_country'][country] += 1
|
|
|
|
updated.append(inst)
|
|
|
|
return updated
|
|
|
|
def _get_country_code(self, institution: dict) -> str | None:
|
|
"""Extract country code from institution locations."""
|
|
locations = institution.get('locations', [])
|
|
if not locations:
|
|
return None
|
|
|
|
# Get country from first location
|
|
return locations[0].get('country')
|
|
|
|
def _update_provenance_notes(self, institution: dict, country: str) -> dict:
|
|
"""Add or update provenance notes with ISIL gap information."""
|
|
# Ensure provenance exists
|
|
if 'provenance' not in institution:
|
|
institution['provenance'] = {}
|
|
|
|
provenance = institution['provenance']
|
|
existing_notes = provenance.get('notes', '').strip()
|
|
gap_note = self.GAP_NOTES[country]
|
|
|
|
# Check if ISIL gap note already present
|
|
if 'ISIL code unavailable' in existing_notes:
|
|
# Update existing note
|
|
# Replace old note with new standardized version
|
|
lines = existing_notes.split('\n')
|
|
filtered = [line for line in lines if 'ISIL code unavailable' not in line]
|
|
filtered.append(gap_note)
|
|
provenance['notes'] = '\n'.join(filtered).strip()
|
|
self.stats['notes_updated'] += 1
|
|
else:
|
|
# Add new note
|
|
if existing_notes:
|
|
provenance['notes'] = f"{existing_notes}\n{gap_note}"
|
|
else:
|
|
provenance['notes'] = gap_note
|
|
self.stats['notes_added'] += 1
|
|
|
|
return institution
|
|
|
|
def _save_institutions(self, institutions: list[dict]) -> None:
|
|
"""Save updated institutions to YAML file."""
|
|
# Add metadata comment at top
|
|
timestamp = datetime.now(timezone.utc).isoformat()
|
|
|
|
with open(self.output_file, 'w', encoding='utf-8') as f:
|
|
# Write header comment
|
|
f.write("---\n")
|
|
f.write("# Latin American GLAM Institutions - ISIL Gap Documented\n")
|
|
f.write(f"# Generated: {timestamp}\n")
|
|
f.write("#\n")
|
|
f.write("# ISIL Gap Documentation Summary:\n")
|
|
f.write(f"# - Total institutions: {self.stats['total_institutions']}\n")
|
|
f.write(f"# - New notes added: {self.stats['notes_added']}\n")
|
|
f.write(f"# - Notes updated: {self.stats['notes_updated']}\n")
|
|
f.write(f"# - Brazil (BR): {self.stats['by_country']['BR']} institutions\n")
|
|
f.write(f"# - Mexico (MX): {self.stats['by_country']['MX']} institutions\n")
|
|
f.write(f"# - Chile (CL): {self.stats['by_country']['CL']} institutions\n")
|
|
f.write("#\n")
|
|
f.write("# ISIL Research Findings:\n")
|
|
f.write("# - No public ISIL registries found for BR, MX, CL\n")
|
|
f.write("# - Wikidata enrichment (2025-11-06): 0 ISIL codes among 2,409 institutions\n")
|
|
f.write("# - Recommended: Direct outreach to national libraries\n")
|
|
f.write("# - Reference: docs/isil_enrichment_strategy.md\n")
|
|
f.write("\n")
|
|
|
|
# Write institutions as YAML
|
|
yaml.dump(
|
|
institutions,
|
|
f,
|
|
default_flow_style=False,
|
|
allow_unicode=True,
|
|
sort_keys=False,
|
|
width=100,
|
|
indent=2
|
|
)
|
|
|
|
def _print_report(self) -> None:
|
|
"""Print summary report of gap documentation."""
|
|
print("\n" + "=" * 70)
|
|
print("ISIL GAP DOCUMENTATION COMPLETE")
|
|
print("=" * 70)
|
|
print(f"Total institutions: {self.stats['total_institutions']}")
|
|
print(f"New notes added: {self.stats['notes_added']}")
|
|
print(f"Notes updated: {self.stats['notes_updated']}")
|
|
print()
|
|
print("Documentation by country:")
|
|
print(f" Brazil (BR): {self.stats['by_country']['BR']} institutions")
|
|
print(f" Mexico (MX): {self.stats['by_country']['MX']} institutions")
|
|
print(f" Chile (CL): {self.stats['by_country']['CL']} institutions")
|
|
print()
|
|
print("Gap Notes Added:")
|
|
print("-" * 70)
|
|
for country, note in self.GAP_NOTES.items():
|
|
print(f"\n[{country}]")
|
|
print(note)
|
|
print()
|
|
print("=" * 70)
|
|
print(f"Output written to: {self.output_file}")
|
|
print("=" * 70)
|
|
|
|
|
|
def main():
    """Locate the enriched dataset and run the ISIL gap documenter on it.

    Both files live in ``data/instances`` under the project root (the parent
    of this script's directory). Exits with status 1 when the input file is
    missing.
    """
    instances_dir = Path(__file__).parent.parent / "data" / "instances"
    input_file = instances_dir / "latin_american_institutions_enriched.yaml"
    output_file = instances_dir / "latin_american_institutions_documented.yaml"

    # Happy path first: the input is present, so hand over to the documenter.
    if input_file.exists():
        ISILGapDocumenter(input_file, output_file).run()
        return

    # Nothing to process without the enriched dataset.
    print(f"ERROR: Input file not found: {input_file}")
    print("Expected: data/instances/latin_american_institutions_enriched.yaml")
    sys.exit(1)
|
|
|
|
|
|
# Run the documenter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|