# glam/scripts/enrich_with_lap_gaza_report.py
# Snapshot metadata: 2025-12-09 07:56:35 +01:00 — 227 lines, 8 KiB, Python.
#!/usr/bin/env python3
"""
Enrich Palestinian heritage institution YAML files with LAP Gaza Report 2024 data.
This script adds structured `lap_gaza_report_enrichment` sections to YAML files
with proper XPath provenance from the archived HTML.
Usage:
python scripts/enrich_with_lap_gaza_report.py [--dry-run]
"""
import json
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, List, Dict, Any
import argparse
# Project root (this script lives in <root>/scripts/, so go up one level).
PROJECT_ROOT = Path(__file__).parent.parent
# Paths
# Claims JSON produced by the LAP report extractor (see `extractor` field below).
CLAIMS_FILE = PROJECT_ROOT / "data" / "extracted" / "lap_gaza_claims.json"
# Directory holding the per-institution PS-*.yaml files to enrich.
CUSTODIAN_DIR = PROJECT_ROOT / "data" / "custodian"
# Repo-relative path to the archived rendered HTML of the report (kept as a
# plain string because it is written verbatim into the YAML provenance).
ARCHIVED_HTML = "data/web/lap_gaza_report_2024/rendered.html"
# Source metadata from the extracted claims
SOURCE_URL = "https://librarianswithpalestine.org/gaza-report-2024/"
SOURCE_TITLE = "Israeli Damage to Archives, Libraries, and Museums in Gaza, October 2023-January 2024"
SOURCE_PUBLISHER = "Librarians and Archivists with Palestine"
REPORT_DATE = "2024-02-01"
# SHA-256 of the archived HTML, pinning provenance to one exact snapshot.
CONTENT_HASH = "f4ac9e0797a381c3f939de62c03b5e2576eaf8a64580d677b3983c43c9b6104e"
def load_claims() -> Dict[str, Any]:
    """Read and deserialize the extracted LAP claims JSON file."""
    raw = CLAIMS_FILE.read_text(encoding='utf-8')
    return json.loads(raw)
def find_institution_by_ghcid(claims: Dict[str, Any], ghcid: str) -> Optional[Dict[str, Any]]:
    """Return the first claims institution whose 'ghcid' matches, or None."""
    candidates = (
        entry for entry in claims.get('institutions', [])
        if entry.get('ghcid') == ghcid
    )
    return next(candidates, None)
def find_martyr_for_institution(claims: Dict[str, Any], institution_ghcid: str) -> List[Dict[str, Any]]:
    """Return martyred information workers linked to a given institution.

    The report only ties some individuals to specific institutions, so the
    link is maintained as a hand-curated GHCID -> names mapping here.
    """
    martyr_mappings = {
        'PS-GZ-GAZ-L-ESL': ['Doaa Al-Masri'],  # Edward Said Library
        'PS-GZ-GAZ-L-QOULG': ['Dr. Jihad Suleiman Al-Masri'],  # Al-Quds Open University - Khan Yunis branch director
    }
    wanted_names = martyr_mappings.get(institution_ghcid, [])
    return [
        worker for worker in claims.get('martyred_information_workers', [])
        if worker.get('name') in wanted_names
    ]
def build_lap_enrichment(institution: Dict[str, Any], martyrs: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]:
    """Assemble the `lap_gaza_report_enrichment` mapping for one institution.

    Combines the fixed report metadata (module constants) with the
    institution-specific damage claim, preserving the XPath provenance
    extracted from the archived HTML.
    """
    source_block = {
        'url': SOURCE_URL,
        'title': SOURCE_TITLE,
        'publisher': SOURCE_PUBLISHER,
        'report_date': REPORT_DATE,
        'archived_html': ARCHIVED_HTML,
        'content_hash_sha256': CONTENT_HASH,
    }
    claim_block = {
        'status': institution.get('damage_status'),
        'date_of_damage': institution.get('date_of_damage'),
        'description': institution.get('description'),
        'xpath': institution.get('xpath'),
    }
    enrichment: Dict[str, Any] = {
        'source': source_block,
        'damage_claim': claim_block,
        'sources': institution.get('sources', []),
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extractor': 'lap_gaza_report_extractor.py',
        'data_tier': 'TIER_2_VERIFIED',
    }
    # Only attach the martyrs key when there is something to record.
    if martyrs:
        enrichment['related_martyred_workers'] = martyrs
    return enrichment
def update_yaml_file(file_path: Path, enrichment: Dict[str, Any], dry_run: bool = False) -> bool:
    """Merge the LAP enrichment section into one custodian YAML file.

    Args:
        file_path: Path to a PS-*.yaml custodian file.
        enrichment: Mapping produced by build_lap_enrichment().
        dry_run: When True, print the intended change without writing.

    Returns:
        True if the file was updated (or would be, under dry_run);
        False if it was skipped, unparseable, or an exception occurred.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        data = yaml.safe_load(content)
        if data is None:
            print(f" ERROR: Could not parse {file_path}")
            return False
        # Skip files that already carry a claim with XPath provenance.
        # isinstance guard: a non-mapping value here previously raised
        # AttributeError and was misreported as a generic ERROR.
        existing = data.get('lap_gaza_report_enrichment')
        if isinstance(existing, dict) and existing.get('damage_claim', {}).get('xpath'):
            print(" SKIP: Already has lap_gaza_report_enrichment with XPath")
            return False
        # Add (or replace an XPath-less) enrichment section.
        data['lap_gaza_report_enrichment'] = enrichment
        # Record this source in the TIER_2 provenance summary when present.
        # isinstance guard avoids substring matching if provenance is a string.
        provenance = data.get('provenance')
        if isinstance(provenance, dict) and 'data_tier_summary' in provenance:
            tier2_sources = provenance['data_tier_summary'].get('TIER_2_VERIFIED', [])
            if 'lap_gaza_report_2024' not in tier2_sources:
                tier2_sources.append('lap_gaza_report_2024')
            provenance['data_tier_summary']['TIER_2_VERIFIED'] = tier2_sources
        if dry_run:
            print(f" DRY RUN: Would update {file_path}")
            print(f" damage_status: {enrichment['damage_claim']['status']}")
            print(f" xpath: {enrichment['damage_claim']['xpath']}")
            return True
        # Write back, preserving key insertion order and Unicode text.
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        print(f" UPDATED: {file_path}")
        return True
    except Exception as e:
        # Broad catch is deliberate: one bad file must not abort the batch run.
        print(f" ERROR: {file_path}: {e}")
        return False
def _normalize_ghcid(stem: str) -> str:
    """Collapse a suffixed filename stem to its canonical 5-component GHCID.

    e.g. 'PS-GZ-GAZ-M-SM-shahwan_museum' -> 'PS-GZ-GAZ-M-SM'.
    Stems with 5 or fewer components are returned unchanged.
    """
    parts = stem.split('-')
    if len(parts) > 5:
        return '-'.join(parts[:5])
    return stem


def main() -> None:
    """Enrich every PS-*.yaml custodian file with LAP Gaza Report claims."""
    parser = argparse.ArgumentParser(description='Enrich PS-*.yaml files with LAP Gaza Report data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    args = parser.parse_args()
    print("=" * 70)
    print("LAP Gaza Report 2024 Enrichment")
    print("=" * 70)
    # Load claims
    claims = load_claims()
    print(f"Loaded {len(claims['institutions'])} institutions from claims")
    print(f"Loaded {len(claims['martyred_information_workers'])} martyred workers from claims")
    # Collect all PS-*.yaml files
    ps_files = list(CUSTODIAN_DIR.glob('PS-*.yaml'))
    print(f"Found {len(ps_files)} PS-*.yaml files")
    print("-" * 70)
    # Track statistics
    updated = 0
    skipped = 0
    not_in_report = 0
    # NOTE(review): update_yaml_file() folds parse failures and exceptions into
    # its False return, so they are currently tallied under "skipped" and this
    # counter stays 0; a true error count needs a richer return value from
    # update_yaml_file() — confirm desired reporting before changing it.
    errors = 0
    for file_path in sorted(ps_files):
        # Extract the canonical GHCID from the filename stem.
        ghcid = _normalize_ghcid(file_path.stem)
        print(f"\n{file_path.name}")
        # Find the matching claim, if the institution appears in the report.
        institution = find_institution_by_ghcid(claims, ghcid)
        if not institution:
            print(f" NOT IN REPORT: No LAP Gaza Report entry for {ghcid}")
            not_in_report += 1
            continue
        # Find related martyrs and build the enrichment payload.
        martyrs = find_martyr_for_institution(claims, ghcid)
        enrichment = build_lap_enrichment(institution, martyrs if martyrs else None)
        # Update file
        if update_yaml_file(file_path, enrichment, dry_run=args.dry_run):
            updated += 1
        else:
            skipped += 1
    print("\n" + "=" * 70)
    print("Summary")
    print("=" * 70)
    print(f"Updated: {updated}")
    print(f"Skipped (already enriched): {skipped}")
    print(f"Not in LAP report: {not_in_report}")
    print(f"Errors: {errors}")
    # List institutions that appear in the report but have no YAML file yet.
    print("\n" + "-" * 70)
    print("Institutions in LAP report needing YAML files:")
    print("-" * 70)
    existing_ghcids = {_normalize_ghcid(f.stem) for f in ps_files}
    for inst in claims['institutions']:
        ghcid = inst.get('ghcid')
        if ghcid is None:
            print(f" NEEDS YAML: {inst['institution_name']} (no GHCID assigned)")
        elif ghcid not in existing_ghcids:
            print(f" NEEDS YAML: {inst['institution_name']} ({ghcid})")
# Run only when executed directly, so the module can be imported side-effect free.
if __name__ == '__main__':
    main()