#!/usr/bin/env python3
"""
Enrich Palestinian heritage institution YAML files with LAP Gaza Report 2024 data.

This script adds structured `lap_gaza_report_enrichment` sections to YAML files
with proper XPath provenance from the archived HTML.

Usage:
    python scripts/enrich_with_lap_gaza_report.py [--dry-run]
"""

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

# Project root (scripts/ lives one level below the repository root)
PROJECT_ROOT = Path(__file__).parent.parent

# Paths
CLAIMS_FILE = PROJECT_ROOT / "data" / "extracted" / "lap_gaza_claims.json"
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
ARCHIVED_HTML = "data/web/lap_gaza_report_2024/rendered.html"

# Source metadata from the extracted claims
SOURCE_URL = "https://librarianswithpalestine.org/gaza-report-2024/"
SOURCE_TITLE = "Israeli Damage to Archives, Libraries, and Museums in Gaza, October 2023-January 2024"
SOURCE_PUBLISHER = "Librarians and Archivists with Palestine"
REPORT_DATE = "2024-02-01"
CONTENT_HASH = "f4ac9e0797a381c3f939de62c03b5e2576eaf8a64580d677b3983c43c9b6104e"


def load_claims() -> Dict[str, Any]:
    """Load the extracted claims from JSON."""
    with open(CLAIMS_FILE, 'r', encoding='utf-8') as f:
        return json.load(f)


def find_institution_by_ghcid(claims: Dict[str, Any], ghcid: str) -> Optional[Dict[str, Any]]:
    """Find an institution in the claims by GHCID; return None when absent."""
    for inst in claims.get('institutions', []):
        if inst.get('ghcid') == ghcid:
            return inst
    return None


def find_martyr_for_institution(claims: Dict[str, Any], institution_ghcid: str) -> List[Dict[str, Any]]:
    """Find martyred information workers associated with an institution.

    The report does not link workers to institutions structurally, so the
    association is a hand-curated name mapping keyed by GHCID.
    """
    # Map of institution GHCIDs to related martyrs.
    # Based on the report, we can link some martyrs to specific institutions.
    martyr_mappings = {
        'PS-GZ-GAZ-L-ESL': ['Doaa Al-Masri'],  # Edward Said Library
        # Al-Quds Open University - Khan Yunis branch director
        'PS-GZ-GAZ-L-QOULG': ['Dr. Jihad Suleiman Al-Masri'],
    }
    martyr_names = martyr_mappings.get(institution_ghcid, [])
    return [
        martyr
        for martyr in claims.get('martyred_information_workers', [])
        if martyr.get('name') in martyr_names
    ]


def build_lap_enrichment(institution: Dict[str, Any],
                         martyrs: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]:
    """Build the `lap_gaza_report_enrichment` section for one institution.

    `martyrs`, when non-empty, is attached under `related_martyred_workers`.
    """
    enrichment: Dict[str, Any] = {
        'source': {
            'url': SOURCE_URL,
            'title': SOURCE_TITLE,
            'publisher': SOURCE_PUBLISHER,
            'report_date': REPORT_DATE,
            'archived_html': ARCHIVED_HTML,
            'content_hash_sha256': CONTENT_HASH,
        },
        'damage_claim': {
            'status': institution.get('damage_status'),
            'date_of_damage': institution.get('date_of_damage'),
            'description': institution.get('description'),
            'xpath': institution.get('xpath'),
        },
        'sources': institution.get('sources', []),
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extractor': 'lap_gaza_report_extractor.py',
        'data_tier': 'TIER_2_VERIFIED',
    }
    if martyrs:
        enrichment['related_martyred_workers'] = martyrs
    return enrichment


def update_yaml_file(file_path: Path, enrichment: Dict[str, Any], dry_run: bool = False) -> str:
    """Update a YAML file with the LAP enrichment.

    Returns a status string so the caller can count outcomes accurately:
      'updated' - enrichment written (or would be written under --dry-run)
      'skipped' - file already carries an enrichment with an XPath
      'error'   - file unreadable, unparseable, or write failed
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        data = yaml.safe_load(content)
        if data is None:
            print(f"  ERROR: Could not parse {file_path}")
            return 'error'

        # Check if already enriched (an XPath marks a completed enrichment;
        # partial sections without one are overwritten).
        if 'lap_gaza_report_enrichment' in data:
            existing = data['lap_gaza_report_enrichment']
            if existing.get('damage_claim', {}).get('xpath'):
                print(f"  SKIP: Already has lap_gaza_report_enrichment with XPath")
                return 'skipped'

        # Add the enrichment
        data['lap_gaza_report_enrichment'] = enrichment

        # Update provenance if it exists
        if 'provenance' in data and 'data_tier_summary' in data['provenance']:
            tier2_sources = data['provenance']['data_tier_summary'].get('TIER_2_VERIFIED', [])
            if 'lap_gaza_report_2024' not in tier2_sources:
                tier2_sources.append('lap_gaza_report_2024')
            data['provenance']['data_tier_summary']['TIER_2_VERIFIED'] = tier2_sources

        if dry_run:
            print(f"  DRY RUN: Would update {file_path}")
            print(f"    damage_status: {enrichment['damage_claim']['status']}")
            print(f"    xpath: {enrichment['damage_claim']['xpath']}")
            return 'updated'

        # Write back
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f"  UPDATED: {file_path}")
        return 'updated'
    except Exception as e:
        print(f"  ERROR: {file_path}: {e}")
        return 'error'


def _normalize_ghcid(stem: str) -> str:
    """Reduce a filename stem to its 5-component GHCID.

    Handles suffixed GHCIDs like PS-GZ-GAZ-M-SM-shahwan_museum by keeping
    only the first 5 dash-separated components.
    """
    if stem.count('-') > 4:
        return '-'.join(stem.split('-')[:5])
    return stem


def main() -> None:
    parser = argparse.ArgumentParser(description='Enrich PS-*.yaml files with LAP Gaza Report data')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be done without making changes')
    args = parser.parse_args()

    print("=" * 70)
    print("LAP Gaza Report 2024 Enrichment")
    print("=" * 70)

    # Load claims
    claims = load_claims()
    print(f"Loaded {len(claims['institutions'])} institutions from claims")
    print(f"Loaded {len(claims['martyred_information_workers'])} martyred workers from claims")

    # Collect all PS-*.yaml files
    ps_files = list(CUSTODIAN_DIR.glob('PS-*.yaml'))
    print(f"Found {len(ps_files)} PS-*.yaml files")
    print("-" * 70)

    # Track statistics
    updated = 0
    skipped = 0
    not_in_report = 0
    errors = 0

    for file_path in sorted(ps_files):
        # Extract GHCID from filename
        ghcid = _normalize_ghcid(file_path.stem)
        print(f"\n{file_path.name}")

        # Find in claims
        institution = find_institution_by_ghcid(claims, ghcid)
        if not institution:
            print(f"  NOT IN REPORT: No LAP Gaza Report entry for {ghcid}")
            not_in_report += 1
            continue

        # Find related martyrs
        martyrs = find_martyr_for_institution(claims, ghcid)

        # Build enrichment and update file
        enrichment = build_lap_enrichment(institution, martyrs or None)
        status = update_yaml_file(file_path, enrichment, dry_run=args.dry_run)
        if status == 'updated':
            updated += 1
        elif status == 'error':
            errors += 1
        else:
            skipped += 1

    print("\n" + "=" * 70)
    print("Summary")
    print("=" * 70)
    print(f"Updated: {updated}")
    print(f"Skipped (already enriched): {skipped}")
    print(f"Not in LAP report: {not_in_report}")
    print(f"Errors: {errors}")

    # List institutions in report but not matched
    print("\n" + "-" * 70)
    print("Institutions in LAP report needing YAML files:")
    print("-" * 70)
    existing_ghcids = {_normalize_ghcid(f.stem) for f in ps_files}
    for inst in claims['institutions']:
        ghcid = inst.get('ghcid')
        if ghcid is None:
            print(f"  NEEDS YAML: {inst['institution_name']} (no GHCID assigned)")
        elif ghcid not in existing_ghcids:
            print(f"  NEEDS YAML: {inst['institution_name']} ({ghcid})")


if __name__ == '__main__':
    main()