#!/usr/bin/env python3
"""
Enrich Palestinian heritage institution files with service_area data.

Maps our region codes to GADM admin2 (governorate) names for werkgebied
(service area) display.
"""

import argparse
import os
import sys
from pathlib import Path
from datetime import datetime, timezone

import yaml

# Base URL of the boundaries API used to build the per-governorate GeoJSON link.
BOUNDARIES_API_BASE = 'http://91.98.224.44:8001'

# Mapping from our region codes to GADM admin2 governorate names
# Our codes are derived from city/region abbreviations
# GADM names are from the boundaries API
REGION_TO_GOVERNORATE = {
    # Gaza Strip (admin1: PSE.1_1)
    'GZ': 'Gaza',                    # Gaza City area
    'GZA': 'GazaashShamaliyah',      # North Gaza (Gaza ash-Shamaliyah)
    'KY': 'KhanYunis',               # Khan Yunis
    'RAF': 'Rafah',                  # Rafah
    'DEB': 'DeirAl-Balah',           # Deir al-Balah (central Gaza)

    # West Bank (admin1: PSE.2_1)
    'BTH': 'Bethlehem',              # Bethlehem
    'HBN': 'Hebron',                 # Hebron
    'JEM': 'Jerusalem',              # Jerusalem (East Jerusalem)
    'RBH': 'RamallahandAl-Bireh',    # Ramallah and Al-Bireh
    'NBS': 'Nablus',                 # Nablus
    'JEN': 'Jenin',                  # Jenin
    'JER': 'Jericho',                # Jericho
    'TKM': 'Tulkarm',                # Tulkarm
    'QAL': 'Qalqilya',               # Qalqilya
    'SAL': 'Salfit',                 # Salfit
    'TUB': 'Tubas',                  # Tubas
    'WE': 'RamallahandAl-Bireh',     # West Bank general → default to Ramallah
}

# City to governorate mapping for cases where region code is ambiguous
CITY_TO_GOVERNORATE = {
    'Gaza City': 'Gaza',
    'Gaza': 'Gaza',
    'Khan Yunis': 'KhanYunis',
    'Rafah': 'Rafah',
    'Deir al-Balah': 'DeirAl-Balah',
    'Beit Hanoun': 'GazaashShamaliyah',  # North Gaza
    'Beit Lahia': 'GazaashShamaliyah',   # North Gaza
    'Jabalia': 'GazaashShamaliyah',      # North Gaza
    'Bethlehem': 'Bethlehem',
    'Hebron': 'Hebron',
    'Jerusalem': 'Jerusalem',
    'East Jerusalem': 'Jerusalem',
    'Ramallah': 'RamallahandAl-Bireh',
    'Al-Bireh': 'RamallahandAl-Bireh',
    'Nablus': 'Nablus',
    'Jenin': 'Jenin',
    'Jericho': 'Jericho',
    'Tulkarm': 'Tulkarm',
    'Qalqilya': 'Qalqilya',
    'Salfit': 'Salfit',
    'Tubas': 'Tubas',
}

# Admin1 codes for region grouping
GOVERNORATE_TO_ADMIN1 = {
    # Gaza Strip
    'Gaza': 'PSE.1_1',
    'GazaashShamaliyah': 'PSE.1_1',
    'KhanYunis': 'PSE.1_1',
    'Rafah': 'PSE.1_1',
    'DeirAl-Balah': 'PSE.1_1',
    # West Bank
    'Bethlehem': 'PSE.2_1',
    'Hebron': 'PSE.2_1',
    'Jerusalem': 'PSE.2_1',
    'RamallahandAl-Bireh': 'PSE.2_1',
    'Nablus': 'PSE.2_1',
    'Jenin': 'PSE.2_1',
    'Jericho': 'PSE.2_1',
    'Tulkarm': 'PSE.2_1',
    'Qalqilya': 'PSE.2_1',
    'Salfit': 'PSE.2_1',
    'Tubas': 'PSE.2_1',
}

# Human-readable names for the admin1 codes above.
ADMIN1_NAMES = {
    'PSE.1_1': 'Gaza Strip',
    'PSE.2_1': 'West Bank',
}

# Governorate name to database ID (from API /boundaries/countries/PS/admin2)
GOVERNORATE_TO_ID = {
    # Gaza Strip
    'DeirAl-Balah': 1,
    'Gaza': 2,
    'GazaashShamaliyah': 3,
    'KhanYunis': 4,
    'Rafah': 5,
    # West Bank
    'Bethlehem': 6,
    'Hebron': 7,
    'Jenin': 8,
    'Jericho': 9,
    'Jerusalem': 10,
    'Nablus': 11,
    'Qalqilya': 12,
    'RamallahandAl-Bireh': 13,
    'Salfit': 14,
    'Tubas': 15,
    'Tulkarm': 16,
}

# Exact messages enrich_file() returns for expected, non-error skips.
# main() compares against these instead of substring-matching free text.
MSG_ALREADY_ENRICHED = "Already has service_area"
MSG_NO_GOVERNORATE = "Could not determine governorate"
SKIP_MESSAGES = frozenset({MSG_ALREADY_ENRICHED, MSG_NO_GOVERNORATE})


def _as_mapping(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict.

    A YAML key with no value (``ghcid:``) loads as ``None``; this keeps
    chained ``.get`` lookups from failing on such entries.
    """
    return value if isinstance(value, dict) else {}


def _lookup(mapping: dict, key) -> str | None:
    """Look up a string *key* in *mapping*, ignoring surrounding whitespace.

    Non-string keys (missing values, lists, numbers) yield ``None`` instead
    of raising.
    """
    if not isinstance(key, str):
        return None
    return mapping.get(key.strip())


def determine_governorate(entry: dict) -> str | None:
    """Determine the governorate name from entry data.

    Sources are tried in order and the first match wins:

    1. ``ghcid.location_resolution.city_name`` via CITY_TO_GOVERNORATE
    2. ``ghcid.location_resolution.region_code`` via REGION_TO_GOVERNORATE
    3. ``original_entry.city`` via CITY_TO_GOVERNORATE

    Returns the governorate name, or ``None`` when nothing matches. Missing,
    null or non-mapping sections are treated as absent.
    """
    entry = _as_mapping(entry)
    loc_res = _as_mapping(_as_mapping(entry.get('ghcid')).get('location_resolution'))
    original = _as_mapping(entry.get('original_entry'))

    candidates = (
        (CITY_TO_GOVERNORATE, loc_res.get('city_name')),
        (REGION_TO_GOVERNORATE, loc_res.get('region_code')),
        (CITY_TO_GOVERNORATE, original.get('city')),
    )
    for mapping, key in candidates:
        governorate = _lookup(mapping, key)
        if governorate:
            return governorate
    return None


def create_service_area(governorate: str) -> dict:
    """Create a service_area dict for the given governorate.

    For a governorate that is not in the lookup tables, ``admin1_code``,
    ``admin1_name``, ``admin2_id`` and ``api_endpoint`` are ``None``; the
    entry is not labelled with a region it may not belong to, and no URL
    containing the text "None" is produced.
    """
    admin1_code = GOVERNORATE_TO_ADMIN1.get(governorate)
    admin2_id = GOVERNORATE_TO_ID.get(governorate)

    api_endpoint = None
    if admin2_id is not None:
        api_endpoint = f'{BOUNDARIES_API_BASE}/boundaries/admin2/{admin2_id}/geojson'

    return {
        'type': 'admin2',
        'country_code': 'PS',
        'admin1_code': admin1_code,
        'admin1_name': ADMIN1_NAMES.get(admin1_code),
        'admin2_id': admin2_id,
        'admin2_name': governorate,
        'source': 'GADM v4.1',
        'api_endpoint': api_endpoint,
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'notes': f'Heritage institution service area based on location in {governorate} governorate.',
    }


def _describe_error(exc: Exception) -> str:
    """Render an exception as a single line suitable for the per-file report."""
    return f"{type(exc).__name__}: {' '.join(str(exc).split())}"


def enrich_file(filepath: Path, dry_run: bool = False) -> tuple[bool, str]:
    """Enrich a single file with service_area data.

    Args:
        filepath: YAML file to enrich in place.
        dry_run: When true, report what would be added without writing.

    Returns:
        ``(success, message)``. ``success`` is true only when a service_area
        was added (or would be, in a dry run). Read, parse and write failures
        are reported through the message rather than raised, so one bad file
        does not abort a batch run.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        return False, f"Failed to read YAML ({_describe_error(exc)})"

    if data is None:
        return False, "Empty or invalid YAML"
    if not isinstance(data, dict):
        return False, "Top-level YAML is not a mapping"

    # Check if already has service_area
    if 'service_area' in data:
        return False, MSG_ALREADY_ENRICHED

    # Determine governorate
    governorate = determine_governorate(data)
    if not governorate:
        return False, MSG_NO_GOVERNORATE

    service_area = create_service_area(governorate)

    if dry_run:
        return True, f"Would add service_area: {governorate}"

    # Rebuild the dict so service_area lands directly after the ghcid section
    # while every other key keeps its position.
    new_data = {}
    for key, value in data.items():
        new_data[key] = value
        if key == 'ghcid':
            new_data['service_area'] = service_area
    # If ghcid wasn't found, add at end
    if 'service_area' not in new_data:
        new_data['service_area'] = service_area

    # Serialize before opening the target: opening with 'w' truncates the
    # file, so a failure during dumping must not happen after that point.
    # NOTE: the round trip through PyYAML does not keep YAML comments.
    try:
        text = yaml.dump(new_data, default_flow_style=False,
                         allow_unicode=True, sort_keys=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)
    except (OSError, yaml.YAMLError) as exc:
        return False, f"Failed to write YAML ({_describe_error(exc)})"

    return True, f"Added service_area: {governorate}"


def main() -> int:
    """Command-line entry point.

    Processes either the single file given with ``--file`` or every
    ``PS-*.yaml`` under ``../data/custodian`` relative to this script, and
    prints a per-file line plus a summary.

    Returns:
        0 when no file produced an error, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Enrich Palestinian institutions with service_area')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    # Find data directory
    script_dir = Path(__file__).parent
    data_dir = script_dir.parent / 'data' / 'custodian'

    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(data_dir.glob('PS-*.yaml'))

    print(f"Processing {len(files)} Palestinian institution files...")
    if args.dry_run:
        print("(DRY RUN - no changes will be made)\n")

    stats = {'success': 0, 'skipped': 0, 'error': 0}

    for filepath in files:
        success, message = enrich_file(filepath, dry_run=args.dry_run)
        if success:
            outcome, status = 'success', 'OK'
        elif message in SKIP_MESSAGES:
            outcome, status = 'skipped', 'SKIP'
        else:
            # Anything that is neither a success nor a known skip is an error.
            outcome, status = 'error', 'ERR'
        stats[outcome] += 1
        print(f" [{status}] {filepath.name}: {message}")

    print("\nSummary:")
    print(f" Enriched: {stats['success']}")
    print(f" Skipped: {stats['skipped']}")
    print(f" Errors: {stats['error']}")

    return 1 if stats['error'] else 0


if __name__ == '__main__':
    sys.exit(main())