#!/usr/bin/env python3
|
|
"""
|
|
Enrich Palestinian heritage institution YAML files with LAP Gaza Report 2024 data.
|
|
|
|
This script adds structured `lap_gaza_report_enrichment` sections to YAML files
|
|
with proper XPath provenance from the archived HTML.
|
|
|
|
Usage:
|
|
python scripts/enrich_with_lap_gaza_report.py [--dry-run]
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, List, Dict, Any
|
|
import argparse
|
|
|
|
# Project root: scripts/ lives one level below the repo root.
PROJECT_ROOT: Path = Path(__file__).parent.parent

# Input/output paths.
CLAIMS_FILE: Path = PROJECT_ROOT / "data" / "extracted" / "lap_gaza_claims.json"  # extracted claims JSON
CUSTODIAN_DIR: Path = PROJECT_ROOT / "data" / "custodian"  # directory of PS-*.yaml files to enrich
# Repo-relative path string (not a Path) — embedded verbatim in each enrichment record.
ARCHIVED_HTML: str = "data/web/lap_gaza_report_2024/rendered.html"

# Source metadata from the extracted claims; copied into every
# lap_gaza_report_enrichment 'source' section.
SOURCE_URL: str = "https://librarianswithpalestine.org/gaza-report-2024/"
SOURCE_TITLE: str = "Israeli Damage to Archives, Libraries, and Museums in Gaza, October 2023-January 2024"
SOURCE_PUBLISHER: str = "Librarians and Archivists with Palestine"
REPORT_DATE: str = "2024-02-01"
# SHA-256 of the archived HTML, for provenance verification.
CONTENT_HASH: str = "f4ac9e0797a381c3f939de62c03b5e2576eaf8a64580d677b3983c43c9b6104e"
|
|
|
|
|
|
def load_claims() -> Dict[str, Any]:
    """Read and parse the extracted-claims JSON document.

    Returns:
        The full claims mapping (institutions, martyred workers, ...).
    """
    return json.loads(CLAIMS_FILE.read_text(encoding='utf-8'))
|
|
|
|
|
|
def find_institution_by_ghcid(claims: Dict[str, Any], ghcid: str) -> Optional[Dict[str, Any]]:
    """Return the first claims institution whose 'ghcid' matches, or None."""
    candidates = (
        entry
        for entry in claims.get('institutions', [])
        if entry.get('ghcid') == ghcid
    )
    return next(candidates, None)
|
|
|
|
|
|
def find_martyr_for_institution(claims: Dict[str, Any], institution_ghcid: str) -> List[Dict[str, Any]]:
    """List martyred information workers tied to one institution.

    The report links only a handful of named workers to specific
    institutions, so the association is a hard-coded GHCID -> names map;
    unknown GHCIDs yield an empty list.
    """
    linked_names = {
        'PS-GZ-GAZ-L-ESL': ['Doaa Al-Masri'],  # Edward Said Library
        'PS-GZ-GAZ-L-QOULG': ['Dr. Jihad Suleiman Al-Masri'],  # Al-Quds Open University - Khan Yunis branch director
    }.get(institution_ghcid, [])

    return [
        worker
        for worker in claims.get('martyred_information_workers', [])
        if worker.get('name') in linked_names
    ]
|
|
|
|
|
|
def build_lap_enrichment(institution: Dict[str, Any], martyrs: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]:
    """Assemble the lap_gaza_report_enrichment mapping for one institution.

    Combines the fixed report-source metadata with the per-institution
    damage claim fields and a UTC extraction timestamp. When a non-empty
    martyrs list is given, it is attached under
    'related_martyred_workers'. Key order is deliberate: the YAML writer
    uses sort_keys=False, so insertion order is what gets serialized.
    """
    source_block = {
        'url': SOURCE_URL,
        'title': SOURCE_TITLE,
        'publisher': SOURCE_PUBLISHER,
        'report_date': REPORT_DATE,
        'archived_html': ARCHIVED_HTML,
        'content_hash_sha256': CONTENT_HASH,
    }
    damage_block = {
        'status': institution.get('damage_status'),
        'date_of_damage': institution.get('date_of_damage'),
        'description': institution.get('description'),
        'xpath': institution.get('xpath'),
    }

    enrichment: Dict[str, Any] = {
        'source': source_block,
        'damage_claim': damage_block,
        'sources': institution.get('sources', []),
        'extraction_timestamp': datetime.now(timezone.utc).isoformat(),
        'extractor': 'lap_gaza_report_extractor.py',
        'data_tier': 'TIER_2_VERIFIED',
    }
    if martyrs:
        enrichment['related_martyred_workers'] = martyrs
    return enrichment
|
|
|
|
|
|
def update_yaml_file(file_path: Path, enrichment: Dict[str, Any], dry_run: bool = False) -> bool:
    """Merge the LAP enrichment into a single custodian YAML file.

    Skips files that already carry an enrichment with an XPath, updates
    the provenance tier summary when present, and either reports the
    pending change (dry_run) or rewrites the file in place.

    Returns:
        True when the file was (or, under dry_run, would be) updated;
        False when it was skipped or any error occurred.
    """
    try:
        data = yaml.safe_load(file_path.read_text(encoding='utf-8'))
        if data is None:
            print(f" ERROR: Could not parse {file_path}")
            return False

        # Idempotence guard: a prior run leaves an enrichment with an
        # XPath; don't clobber it.
        if 'lap_gaza_report_enrichment' in data:
            if data['lap_gaza_report_enrichment'].get('damage_claim', {}).get('xpath'):
                print(f" SKIP: Already has lap_gaza_report_enrichment with XPath")
                return False

        data['lap_gaza_report_enrichment'] = enrichment

        # Record the report as a TIER_2_VERIFIED source, once.
        if 'provenance' in data and 'data_tier_summary' in data['provenance']:
            summary = data['provenance']['data_tier_summary']
            tier2 = summary.get('TIER_2_VERIFIED', [])
            if 'lap_gaza_report_2024' not in tier2:
                tier2.append('lap_gaza_report_2024')
                summary['TIER_2_VERIFIED'] = tier2

        if dry_run:
            print(f" DRY RUN: Would update {file_path}")
            print(f" damage_status: {enrichment['damage_claim']['status']}")
            print(f" xpath: {enrichment['damage_claim']['xpath']}")
            return True

        # Persist. sort_keys=False preserves the enrichment's key order.
        with file_path.open('w', encoding='utf-8') as handle:
            yaml.dump(data, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)

        print(f" UPDATED: {file_path}")
        return True

    except Exception as e:
        # Broad catch is deliberate for this batch script: one bad file
        # must not abort the whole run.
        print(f" ERROR: {file_path}: {e}")
        return False
|
|
|
|
|
|
def _base_ghcid(stem: str) -> str:
    """Reduce a filename stem to its 5-component base GHCID.

    Suffixed stems such as 'PS-GZ-GAZ-M-SM-shahwan_museum' are truncated
    to their first five dash-separated components; stems with five or
    fewer components pass through unchanged.
    """
    parts = stem.split('-')
    if len(parts) > 5:
        return '-'.join(parts[:5])
    return stem


def main() -> None:
    """Enrich every PS-*.yaml custodian file with LAP Gaza Report claims.

    Walks CUSTODIAN_DIR for PS-*.yaml files, matches each file's base
    GHCID against the extracted claims, writes the enrichment section
    (or previews it with --dry-run), then prints a summary and the
    report institutions that still lack YAML files.
    """
    parser = argparse.ArgumentParser(description='Enrich PS-*.yaml files with LAP Gaza Report data')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    args = parser.parse_args()

    print("=" * 70)
    print("LAP Gaza Report 2024 Enrichment")
    print("=" * 70)

    # Load the extracted claims once for the whole run.
    claims = load_claims()
    print(f"Loaded {len(claims['institutions'])} institutions from claims")
    print(f"Loaded {len(claims['martyred_information_workers'])} martyred workers from claims")

    ps_files = list(CUSTODIAN_DIR.glob('PS-*.yaml'))
    print(f"Found {len(ps_files)} PS-*.yaml files")
    print("-" * 70)

    # Statistics for the summary.
    updated = 0
    skipped = 0
    not_in_report = 0
    # NOTE(review): update_yaml_file signals failure only through its
    # boolean return and its own printout, so this counter is never
    # incremented — the summary always shows "Errors: 0". Kept so the
    # summary output is unchanged; errors are currently folded into
    # "Skipped".
    errors = 0

    for file_path in sorted(ps_files):
        # Filenames may carry a descriptive suffix after the GHCID.
        ghcid = _base_ghcid(file_path.stem)

        print(f"\n{file_path.name}")

        institution = find_institution_by_ghcid(claims, ghcid)
        if not institution:
            print(f" NOT IN REPORT: No LAP Gaza Report entry for {ghcid}")
            not_in_report += 1
            continue

        martyrs = find_martyr_for_institution(claims, ghcid)
        enrichment = build_lap_enrichment(institution, martyrs if martyrs else None)

        if update_yaml_file(file_path, enrichment, dry_run=args.dry_run):
            updated += 1
        else:
            skipped += 1

    print("\n" + "=" * 70)
    print("Summary")
    print("=" * 70)
    print(f"Updated: {updated}")
    print(f"Skipped (already enriched): {skipped}")
    print(f"Not in LAP report: {not_in_report}")
    print(f"Errors: {errors}")

    # Report entries that have no corresponding YAML file yet.
    print("\n" + "-" * 70)
    print("Institutions in LAP report needing YAML files:")
    print("-" * 70)

    existing_ghcids = {_base_ghcid(f.stem) for f in ps_files}

    for inst in claims['institutions']:
        ghcid = inst.get('ghcid')
        if ghcid is None:
            print(f" NEEDS YAML: {inst['institution_name']} (no GHCID assigned)")
        elif ghcid not in existing_ghcids:
            print(f" NEEDS YAML: {inst['institution_name']} ({ghcid})")


if __name__ == '__main__':
    main()
|