#!/usr/bin/env python3
"""
Add ISIL gap documentation to Latin American institution provenance records.

This script:
1. Reads the Wikidata-enriched Latin American institutions dataset
2. Adds standardized provenance notes explaining ISIL unavailability
3. Documents Wikidata enrichment results (0 ISIL codes found among 2,409 institutions)
4. References the enrichment strategy document for details
5. Writes updated YAML with comprehensive gap documentation

Context:
- Brazil, Mexico, and Chile lack publicly accessible ISIL registries
- Wikidata enrichment (2025-11-06) confirmed ISIL unavailability
- 0 ISIL codes found among 2,409 Wikidata GLAM institutions in these countries

Usage:
    python scripts/add_isil_gap_notes.py

Input:
    data/instances/latin_american_institutions_enriched.yaml (304 institutions)

Output:
    data/instances/latin_american_institutions_documented.yaml
"""

import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Add src to path for imports
# NOTE(review): nothing from src/ is imported in this file as seen here;
# kept because other code may rely on this module-level side effect.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))


class ISILGapDocumenter:
    """Add ISIL gap documentation to institution provenance records.

    The documenter loads a YAML list of institution mappings, appends (or
    refreshes) a standardized "ISIL code unavailable" note in the
    ``provenance.notes`` field of every institution whose first location has
    a country code listed in ``GAP_NOTES``, writes the result to a new YAML
    file and prints a summary.
    """

    # Country-specific ISIL gap notes, keyed by the country code found in
    # an institution's first location.
    GAP_NOTES = {
        'BR': (
            "ISIL code unavailable: No public ISIL registry exists for Brazil. "
            "Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
            "Brazilian GLAM institutions. National registration agency (Biblioteca "
            "Nacional do Brasil or IBICT) has not published public ISIL directory. "
            "Recommended action: Direct outreach to Biblioteca Nacional do Brasil. "
            "See docs/isil_enrichment_strategy.md for details."
        ),
        'MX': (
            "ISIL code unavailable: No public ISIL registry exists for Mexico. "
            "Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
            "Mexican GLAM institutions. National registration agency (Biblioteca "
            "Nacional de México under UNAM) has not published public ISIL directory. "
            "Recommended action: Direct outreach to Biblioteca Nacional de México. "
            "See docs/isil_enrichment_strategy.md for details."
        ),
        'CL': (
            "ISIL code unavailable: No public ISIL registry exists for Chile. "
            "Wikidata enrichment (2025-11-06) found 0 ISIL codes among 2,409 "
            "Chilean GLAM institutions. National registration agency (Biblioteca "
            "Nacional de Chile or Servicio Nacional del Patrimonio Cultural) has not "
            "published public ISIL directory. Recommended action: Direct outreach to "
            "Biblioteca Nacional de Chile. See docs/isil_enrichment_strategy.md for details."
        ),
    }

    def __init__(self, input_file: Path, output_file: Path):
        """Store the file paths and initialise the run statistics.

        Args:
            input_file: YAML file containing a list of institutions.
            output_file: Destination YAML file for the documented institutions.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.stats = {
            'total_institutions': 0,
            'notes_added': 0,
            'notes_updated': 0,
            # Derived from GAP_NOTES so that every country that can receive
            # a note also has a counter (avoids KeyError if a country is added).
            'by_country': dict.fromkeys(self.GAP_NOTES, 0),
        }

    def run(self) -> None:
        """Main execution: read, process, write, report."""
        print(f"Reading institutions from: {self.input_file}")
        institutions = self._load_institutions()

        print("Adding ISIL gap documentation to provenance records...")
        updated_institutions = self._add_gap_notes(institutions)

        print(f"Writing documented institutions to: {self.output_file}")
        self._save_institutions(updated_institutions)

        self._print_report()

    def _load_institutions(self) -> list[dict]:
        """Load institutions from the input YAML file.

        Returns:
            The parsed top-level list of institutions.

        Raises:
            ValueError: If the YAML document's top level is not a list
                (this includes an empty file, which parses to ``None``).
        """
        with open(self.input_file, 'r', encoding='utf-8') as f:
            institutions = yaml.safe_load(f)

        if not isinstance(institutions, list):
            raise ValueError(f"Expected list of institutions, got {type(institutions)}")

        self.stats['total_institutions'] = len(institutions)
        return institutions

    def _add_gap_notes(self, institutions: list[dict]) -> list[dict]:
        """Add ISIL gap notes to each institution's provenance.

        Institutions whose country code has no entry in ``GAP_NOTES`` (or
        whose country cannot be determined) are passed through unchanged.

        Args:
            institutions: Institution records; matching records are
                modified in place.

        Returns:
            A new list holding the same records in the same order.
        """
        updated = []
        for inst in institutions:
            country = self._get_country_code(inst)
            if country and country in self.GAP_NOTES:
                inst = self._update_provenance_notes(inst, country)
                self.stats['by_country'][country] += 1
            updated.append(inst)
        return updated

    def _get_country_code(self, institution: dict) -> str | None:
        """Extract the country code from an institution's first location.

        Args:
            institution: A single institution record.

        Returns:
            The ``country`` value of the first entry in ``locations`` when it
            is a string; ``None`` when the record, its locations, the first
            location or the country value is missing or has an unexpected
            type (malformed records are skipped rather than crashing the run).
        """
        if not isinstance(institution, dict):
            return None

        locations = institution.get('locations')
        if not isinstance(locations, list) or not locations:
            return None

        # Only the first location is consulted.
        first_location = locations[0]
        if not isinstance(first_location, dict):
            return None

        country = first_location.get('country')
        return country if isinstance(country, str) else None

    def _update_provenance_notes(self, institution: dict, country: str) -> dict:
        """Add or update provenance notes with ISIL gap information.

        If the existing notes already contain an "ISIL code unavailable"
        line, every such line is dropped and the current standardized note is
        appended; otherwise the note is appended to whatever notes exist.

        Args:
            institution: Institution record, modified in place.
            country: Country code; must be a key of ``GAP_NOTES``.

        Returns:
            The same ``institution`` object, for convenience.

        Raises:
            KeyError: If ``country`` is not a key of ``GAP_NOTES``.
        """
        # A YAML key with an empty value ("provenance:") loads as None, so
        # treat a null provenance the same as a missing one.
        provenance = institution.get('provenance')
        if provenance is None:
            provenance = {}
            institution['provenance'] = provenance

        # Likewise "notes:" with no value loads as None, which has no .strip().
        existing_notes = (provenance.get('notes') or '').strip()
        gap_note = self.GAP_NOTES[country]

        if 'ISIL code unavailable' in existing_notes:
            # Replace any old gap note line(s) with the standardized version.
            kept_lines = [
                line
                for line in existing_notes.split('\n')
                if 'ISIL code unavailable' not in line
            ]
            kept_lines.append(gap_note)
            provenance['notes'] = '\n'.join(kept_lines).strip()
            self.stats['notes_updated'] += 1
        else:
            if existing_notes:
                provenance['notes'] = f"{existing_notes}\n{gap_note}"
            else:
                provenance['notes'] = gap_note
            self.stats['notes_added'] += 1

        return institution

    def _save_institutions(self, institutions: list[dict]) -> None:
        """Save updated institutions to the output YAML file.

        The file starts with a ``---`` document marker and a comment header
        summarising the run statistics, followed by the YAML dump of
        ``institutions``.

        Args:
            institutions: Institution records to serialise.
        """
        timestamp = datetime.now(timezone.utc).isoformat()
        by_country = self.stats['by_country']

        header_lines = [
            "---\n",
            "# Latin American GLAM Institutions - ISIL Gap Documented\n",
            f"# Generated: {timestamp}\n",
            "#\n",
            "# ISIL Gap Documentation Summary:\n",
            f"# - Total institutions: {self.stats['total_institutions']}\n",
            f"# - New notes added: {self.stats['notes_added']}\n",
            f"# - Notes updated: {self.stats['notes_updated']}\n",
            f"# - Brazil (BR): {by_country['BR']} institutions\n",
            f"# - Mexico (MX): {by_country['MX']} institutions\n",
            f"# - Chile (CL): {by_country['CL']} institutions\n",
            "#\n",
            "# ISIL Research Findings:\n",
            "# - No public ISIL registries found for BR, MX, CL\n",
            "# - Wikidata enrichment (2025-11-06): 0 ISIL codes among 2,409 institutions\n",
            "# - Recommended: Direct outreach to national libraries\n",
            "# - Reference: docs/isil_enrichment_strategy.md\n",
            "\n",
        ]

        # Make sure the destination directory exists before opening the file.
        Path(self.output_file).parent.mkdir(parents=True, exist_ok=True)

        with open(self.output_file, 'w', encoding='utf-8') as f:
            # Header comment first, then the institutions as YAML.
            f.writelines(header_lines)
            yaml.dump(
                institutions,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=100,
                indent=2,
            )

    def _print_report(self) -> None:
        """Print summary report of gap documentation to stdout."""
        print("\n" + "=" * 70)
        print("ISIL GAP DOCUMENTATION COMPLETE")
        print("=" * 70)
        print(f"Total institutions: {self.stats['total_institutions']}")
        print(f"New notes added: {self.stats['notes_added']}")
        print(f"Notes updated: {self.stats['notes_updated']}")
        print()
        print("Documentation by country:")
        print(f" Brazil (BR): {self.stats['by_country']['BR']} institutions")
        print(f" Mexico (MX): {self.stats['by_country']['MX']} institutions")
        print(f" Chile (CL): {self.stats['by_country']['CL']} institutions")
        print()
        print("Gap Notes Added:")
        print("-" * 70)
        for country, note in self.GAP_NOTES.items():
            print(f"\n[{country}]")
            print(note)
        print()
        print("=" * 70)
        print(f"Output written to: {self.output_file}")
        print("=" * 70)


def main():
    """Main entry point: resolve the data paths and run the documenter.

    Exits with status 1 if the input file does not exist.
    """
    # File paths, relative to the project root (the parent of this
    # script's directory).
    project_root = Path(__file__).parent.parent
    input_file = project_root / "data" / "instances" / "latin_american_institutions_enriched.yaml"
    output_file = project_root / "data" / "instances" / "latin_american_institutions_documented.yaml"

    # Validate input exists
    if not input_file.exists():
        print(f"ERROR: Input file not found: {input_file}")
        print("Expected: data/instances/latin_american_institutions_enriched.yaml")
        sys.exit(1)

    # Run documenter
    documenter = ISILGapDocumenter(input_file, output_file)
    documenter.run()


if __name__ == "__main__":
    main()