# glam/scripts/enrich_with_lap_gaza_report.py
# Snapshot metadata: 2025-12-09 07:56:35 +01:00 — 227 lines, 8 KiB, Python.
#!/usr/bin/env python3
"""
Enrich Palestinian heritage institution YAML files with LAP Gaza Report 2024 data.
This script adds structured `lap_gaza_report_enrichment` sections to YAML files
with proper XPath provenance from the archived HTML.
Usage:
python scripts/enrich_with_lap_gaza_report.py [--dry-run]
"""
import json
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, List, Dict, Any
import argparse
# Project root (this script lives in <root>/scripts/, so go up one level).
PROJECT_ROOT = Path(__file__).parent.parent
# Paths
# Claims JSON produced by the LAP report extractor (see `extractor` field below).
CLAIMS_FILE = PROJECT_ROOT / "data" / "extracted" / "lap_gaza_claims.json"
# Directory holding the per-institution PS-*.yaml files to enrich.
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
# Repo-relative path to the archived rendered HTML of the report (kept as a
# plain string because it is written verbatim into the YAML provenance).
ARCHIVED_HTML = "data/web/lap_gaza_report_2024/rendered.html"
# Source metadata from the extracted claims
SOURCE_URL = "https://librarianswithpalestine.org/gaza-report-2024/"
SOURCE_TITLE = "Israeli Damage to Archives, Libraries, and Museums in Gaza, October 2023-January 2024"
SOURCE_PUBLISHER = "Librarians and Archivists with Palestine"
REPORT_DATE = "2024-02-01"
# SHA-256 of the archived HTML, pinning provenance to one exact snapshot.
CONTENT_HASH = "f4ac9e0797a381c3f939de62c03b5e2576eaf8a64580d677b3983c43c9b6104e"
def load_claims() -> Dict[str, Any]:
    """Read and deserialize the extracted LAP claims JSON file."""
    raw = CLAIMS_FILE.read_text(encoding='utf-8')
    return json.loads(raw)
def find_institution_by_ghcid(claims: Dict[str, Any], ghcid: str) -> Optional[Dict[str, Any]]:
    """Return the first claims institution whose 'ghcid' matches, or None."""
    candidates = (
        entry for entry in claims.get('institutions', [])
        if entry.get('ghcid') == ghcid
    )
    return next(candidates, None)
def find_martyr_for_institution(claims: Dict[str, Any], institution_ghcid: str) -> List[Dict[str, Any]]:
    """Return martyred information workers linked to a given institution.

    The report only ties some individuals to specific institutions, so the
    link is maintained as a hand-curated GHCID -> names mapping here.
    """
    martyr_mappings = {
        'PS-GZ-GAZ-L-ESL': ['Doaa Al-Masri'],  # Edward Said Library
        'PS-GZ-GAZ-L-QOULG': ['Dr. Jihad Suleiman Al-Masri'],  # Al-Quds Open University - Khan Yunis branch director
    }
    wanted_names = martyr_mappings.get(institution_ghcid, [])
    return [
        worker for worker in claims.get('martyred_information_workers', [])
        if worker.get('name') in wanted_names
    ]
def build_lap_enrichment(institution: Dict[str, Any], martyrs: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]:
    """Assemble the `lap_gaza_report_enrichment` mapping for one institution.

    Combines the fixed report metadata (module constants) with the
    institution-specific damage claim, preserving the XPath provenance
    extracted from the archived HTML.
    """
    source_block = {
        'url': SOURCE_URL,
        'title': SOURCE_TITLE,
        'publisher': SOURCE_PUBLISHER,
        'report_date': REPORT_DATE,
        'archived_html': ARCHIVED_HTML,
        'content_hash_sha256': CONTENT_HASH,
    }
    claim_block = {
        'status': institution.get('damage_status'),
        'date_of_damage': institution.get('date_of_damage'),
        'description': institution.get('description'),
        'xpath': institution.get('xpath'),
    }
    enrichment: Dict[str, Any] = {
        'source': source_block,
        'damage_claim': claim_block,
        'sources': institution.get('sources', []),
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extractor': 'lap_gaza_report_extractor.py',
        'data_tier': 'TIER_2_VERIFIED',
    }
    # Only attach the martyrs key when there is something to record.
    if martyrs:
        enrichment['related_martyred_workers'] = martyrs
    return enrichment
def update_yaml_file(file_path: Path, enrichment: Dict[str, Any], dry_run: bool = False) -> bool:
    """Merge the LAP enrichment section into one custodian YAML file.

    Args:
        file_path: Path to a PS-*.yaml custodian file.
        enrichment: Mapping produced by build_lap_enrichment().
        dry_run: When True, print the intended change without writing.

    Returns:
        True if the file was updated (or would be, under dry_run);
        False if it was skipped, unparseable, or an exception occurred.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        data = yaml.safe_load(content)
        if data is None:
            print(f" ERROR: Could not parse {file_path}")
            return False
        # Skip files that already carry a claim with XPath provenance.
        # isinstance guard: a non-mapping value here previously raised
        # AttributeError and was misreported as a generic ERROR.
        existing = data.get('lap_gaza_report_enrichment')
        if isinstance(existing, dict) and existing.get('damage_claim', {}).get('xpath'):
            print(" SKIP: Already has lap_gaza_report_enrichment with XPath")
            return False
        # Add (or replace an XPath-less) enrichment section.
        data['lap_gaza_report_enrichment'] = enrichment
        # Record this source in the TIER_2 provenance summary when present.
        # isinstance guard avoids substring matching if provenance is a string.
        provenance = data.get('provenance')
        if isinstance(provenance, dict) and 'data_tier_summary' in provenance:
            tier2_sources = provenance['data_tier_summary'].get('TIER_2_VERIFIED', [])
            if 'lap_gaza_report_2024' not in tier2_sources:
                tier2_sources.append('lap_gaza_report_2024')
            provenance['data_tier_summary']['TIER_2_VERIFIED'] = tier2_sources
        if dry_run:
            print(f" DRY RUN: Would update {file_path}")
            print(f" damage_status: {enrichment['damage_claim']['status']}")
            print(f" xpath: {enrichment['damage_claim']['xpath']}")
            return True
        # Write back, preserving key insertion order and Unicode text.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f" UPDATED: {file_path}")
        return True
    except Exception as e:
        # Broad catch is deliberate: one bad file must not abort the batch run.
        print(f" ERROR: {file_path}: {e}")
        return False
def _normalize_ghcid(stem: str) -> str:
    """Collapse a suffixed filename stem to its canonical 5-component GHCID.

    e.g. 'PS-GZ-GAZ-M-SM-shahwan_museum' -> 'PS-GZ-GAZ-M-SM'.
    Stems with 5 or fewer components are returned unchanged.
    """
    parts = stem.split('-')
    if len(parts) > 5:
        return '-'.join(parts[:5])
    return stem


def main() -> None:
    """Enrich every PS-*.yaml custodian file with LAP Gaza Report claims."""
    parser = argparse.ArgumentParser(description='Enrich PS-*.yaml files with LAP Gaza Report data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    args = parser.parse_args()
    print("=" * 70)
    print("LAP Gaza Report 2024 Enrichment")
    print("=" * 70)
    # Load claims
    claims = load_claims()
    print(f"Loaded {len(claims['institutions'])} institutions from claims")
    print(f"Loaded {len(claims['martyred_information_workers'])} martyred workers from claims")
    # Collect all PS-*.yaml files
    ps_files = list(CUSTODIAN_DIR.glob('PS-*.yaml'))
    print(f"Found {len(ps_files)} PS-*.yaml files")
    print("-" * 70)
    # Track statistics
    updated = 0
    skipped = 0
    not_in_report = 0
    # NOTE(review): update_yaml_file() folds parse failures and exceptions into
    # its False return, so they are currently tallied under "skipped" and this
    # counter stays 0; a true error count needs a richer return value from
    # update_yaml_file() — confirm desired reporting before changing it.
    errors = 0
    for file_path in sorted(ps_files):
        # Extract the canonical GHCID from the filename stem.
        ghcid = _normalize_ghcid(file_path.stem)
        print(f"\n{file_path.name}")
        # Find the matching claim, if the institution appears in the report.
        institution = find_institution_by_ghcid(claims, ghcid)
        if not institution:
            print(f" NOT IN REPORT: No LAP Gaza Report entry for {ghcid}")
            not_in_report += 1
            continue
        # Find related martyrs and build the enrichment payload.
        martyrs = find_martyr_for_institution(claims, ghcid)
        enrichment = build_lap_enrichment(institution, martyrs if martyrs else None)
        # Update file
        if update_yaml_file(file_path, enrichment, dry_run=args.dry_run):
            updated += 1
        else:
            skipped += 1
    print("\n" + "=" * 70)
    print("Summary")
    print("=" * 70)
    print(f"Updated: {updated}")
    print(f"Skipped (already enriched): {skipped}")
    print(f"Not in LAP report: {not_in_report}")
    print(f"Errors: {errors}")
    # List institutions that appear in the report but have no YAML file yet.
    print("\n" + "-" * 70)
    print("Institutions in LAP report needing YAML files:")
    print("-" * 70)
    existing_ghcids = {_normalize_ghcid(f.stem) for f in ps_files}
    for inst in claims['institutions']:
        ghcid = inst.get('ghcid')
        if ghcid is None:
            print(f" NEEDS YAML: {inst['institution_name']} (no GHCID assigned)")
        elif ghcid not in existing_ghcids:
            print(f" NEEDS YAML: {inst['institution_name']} ({ghcid})")
# Run only when executed directly, so the module can be imported side-effect free.
if __name__ == '__main__':
    main()