glam/scripts/enrich_ps_service_area.py
2025-12-07 23:08:02 +01:00

240 lines
7.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Palestinian heritage institution files with service_area data.
Maps our region codes to GADM admin2 (governorate) names for the werkgebied (Dutch for 'service area') display.
"""
import os
import sys
from pathlib import Path
from datetime import datetime, timezone
import yaml
# Region code -> GADM admin2 (governorate) name.
# Keys are our own codes, derived from city/region abbreviations; values are
# the governorate names as they come from the boundaries API.
REGION_TO_GOVERNORATE = {
    # --- Gaza Strip (admin1: PSE.1_1) ---
    'GZ': 'Gaza',                     # Gaza City area
    'GZA': 'GazaashShamaliyah',       # North Gaza (Gaza ash-Shamaliyah)
    'KY': 'KhanYunis',                # Khan Yunis
    'RAF': 'Rafah',                   # Rafah
    'DEB': 'DeirAl-Balah',            # Deir al-Balah (central Gaza)
    # --- West Bank (admin1: PSE.2_1) ---
    'BTH': 'Bethlehem',               # Bethlehem
    'HBN': 'Hebron',                  # Hebron
    'JEM': 'Jerusalem',               # Jerusalem (East Jerusalem)
    'RBH': 'RamallahandAl-Bireh',     # Ramallah and Al-Bireh
    'NBS': 'Nablus',                  # Nablus
    'JEN': 'Jenin',                   # Jenin
    'JER': 'Jericho',                 # Jericho
    'TKM': 'Tulkarm',                 # Tulkarm
    'QAL': 'Qalqilya',                # Qalqilya
    'SAL': 'Salfit',                  # Salfit
    'TUB': 'Tubas',                   # Tubas
    # Generic "West Bank" code: falls back to the Ramallah governorate.
    'WE': 'RamallahandAl-Bireh',
}
# City name -> GADM admin2 (governorate) name.
# Used when the region code alone is ambiguous.
CITY_TO_GOVERNORATE = {
    # Gaza Strip
    'Gaza City': 'Gaza',
    'Gaza': 'Gaza',
    'Khan Yunis': 'KhanYunis',
    'Rafah': 'Rafah',
    'Deir al-Balah': 'DeirAl-Balah',
    # North Gaza towns
    'Beit Hanoun': 'GazaashShamaliyah',
    'Beit Lahia': 'GazaashShamaliyah',
    'Jabalia': 'GazaashShamaliyah',
    # West Bank
    'Bethlehem': 'Bethlehem',
    'Hebron': 'Hebron',
    'Jerusalem': 'Jerusalem',
    'East Jerusalem': 'Jerusalem',
    'Ramallah': 'RamallahandAl-Bireh',
    'Al-Bireh': 'RamallahandAl-Bireh',
    'Nablus': 'Nablus',
    'Jenin': 'Jenin',
    'Jericho': 'Jericho',
    'Tulkarm': 'Tulkarm',
    'Qalqilya': 'Qalqilya',
    'Salfit': 'Salfit',
    'Tubas': 'Tubas',
}
# Governorate name -> admin1 code, used to group governorates by region.
GOVERNORATE_TO_ADMIN1 = {
    # Gaza Strip governorates
    'Gaza': 'PSE.1_1',
    'GazaashShamaliyah': 'PSE.1_1',
    'KhanYunis': 'PSE.1_1',
    'Rafah': 'PSE.1_1',
    'DeirAl-Balah': 'PSE.1_1',
    # West Bank governorates
    'Bethlehem': 'PSE.2_1',
    'Hebron': 'PSE.2_1',
    'Jerusalem': 'PSE.2_1',
    'RamallahandAl-Bireh': 'PSE.2_1',
    'Nablus': 'PSE.2_1',
    'Jenin': 'PSE.2_1',
    'Jericho': 'PSE.2_1',
    'Tulkarm': 'PSE.2_1',
    'Qalqilya': 'PSE.2_1',
    'Salfit': 'PSE.2_1',
    'Tubas': 'PSE.2_1',
}
# Governorate name -> database ID, as listed by the API endpoint
# /boundaries/countries/PS/admin2.
GOVERNORATE_TO_ID = {
    # Gaza Strip governorates
    'DeirAl-Balah': 1,
    'Gaza': 2,
    'GazaashShamaliyah': 3,
    'KhanYunis': 4,
    'Rafah': 5,
    # West Bank governorates
    'Bethlehem': 6,
    'Hebron': 7,
    'Jenin': 8,
    'Jericho': 9,
    'Jerusalem': 10,
    'Nablus': 11,
    'Qalqilya': 12,
    'RamallahandAl-Bireh': 13,
    'Salfit': 14,
    'Tubas': 15,
    'Tulkarm': 16,
}
def determine_governorate(entry: dict) -> str | None:
"""Determine the governorate name from entry data."""
# Try location_resolution first
loc_res = entry.get('ghcid', {}).get('location_resolution', {})
# 1. Try city_name → governorate mapping
city_name = loc_res.get('city_name')
if city_name and city_name in CITY_TO_GOVERNORATE:
return CITY_TO_GOVERNORATE[city_name]
# 2. Try region_code → governorate mapping
region_code = loc_res.get('region_code')
if region_code and region_code in REGION_TO_GOVERNORATE:
return REGION_TO_GOVERNORATE[region_code]
# 3. Try original_entry city
orig_city = entry.get('original_entry', {}).get('city')
if orig_city and orig_city in CITY_TO_GOVERNORATE:
return CITY_TO_GOVERNORATE[orig_city]
return None
def create_service_area(governorate: str) -> dict:
    """Build the ``service_area`` mapping for a governorate.

    Args:
        governorate: Governorate name, expected to be a key of
            ``GOVERNORATE_TO_ADMIN1`` and ``GOVERNORATE_TO_ID``.

    Returns:
        A dict with the admin1 code/name, the admin2 ID/name, the data
        source label, the GeoJSON endpoint URL for the admin2 ID, a UTC
        ISO-8601 enrichment timestamp, and a free-text note. For a
        governorate missing from the lookup tables the corresponding
        ``admin1_*``, ``admin2_id`` and ``api_endpoint`` values are ``None``.
    """
    admin1_names = {'PSE.1_1': 'Gaza Strip', 'PSE.2_1': 'West Bank'}

    admin1_code = GOVERNORATE_TO_ADMIN1.get(governorate)
    # Look the name up instead of defaulting to 'West Bank', so an unknown
    # governorate is not silently labelled as a West Bank one.
    admin1_name = admin1_names.get(admin1_code)

    admin2_id = GOVERNORATE_TO_ID.get(governorate)
    # Without an ID the URL would contain the literal text "None".
    if admin2_id is not None:
        api_endpoint = f'http://91.98.224.44:8001/boundaries/admin2/{admin2_id}/geojson'
    else:
        api_endpoint = None

    return {
        'type': 'admin2',
        'country_code': 'PS',
        'admin1_code': admin1_code,
        'admin1_name': admin1_name,
        'admin2_id': admin2_id,
        'admin2_name': governorate,
        'source': 'GADM v4.1',
        'api_endpoint': api_endpoint,
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'notes': f'Heritage institution service area based on location in {governorate} governorate.',
    }
def enrich_file(filepath: Path, dry_run: bool = False) -> tuple[bool, str]:
    """Enrich a single YAML file with ``service_area`` data.

    The new ``service_area`` key is placed directly after the top-level
    ``ghcid`` key, or appended at the end when there is no ``ghcid`` key.

    Args:
        filepath: Path of the institution YAML file to update in place.
        dry_run: When true, report what would be added without writing.

    Returns:
        ``(success, message)``. Read, parse and write failures are reported
        as ``(False, message)`` instead of being raised, so one bad file does
        not abort a batch run.

    NOTE(review): the file is rewritten through a PyYAML load/dump round
    trip, which is not expected to preserve YAML comments or the original
    formatting.
    """
    # Failure messages deliberately avoid the phrases "Already" and
    # "Could not": main() treats messages containing them as skips, not errors.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except OSError as exc:
        return False, f"Cannot read file: {exc}"
    except (UnicodeDecodeError, yaml.YAMLError) as exc:
        return False, f"Invalid YAML: {exc}"

    # safe_load returns None for an empty document and may return a list or
    # scalar for other content; only a top-level mapping can be enriched.
    if not isinstance(data, dict):
        return False, "Empty or invalid YAML"

    # Check if already has service_area
    if 'service_area' in data:
        return False, "Already has service_area"

    # Determine governorate
    governorate = determine_governorate(data)
    if not governorate:
        return False, "Could not determine governorate"

    if dry_run:
        return True, f"Would add service_area: {governorate}"

    service_area = create_service_area(governorate)

    # Rebuild the mapping so that key order is preserved and service_area
    # lands immediately after ghcid.
    new_data = {}
    for key, value in data.items():
        new_data[key] = value
        if key == 'ghcid':
            new_data['service_area'] = service_area
    # If ghcid wasn't found, add at end
    new_data.setdefault('service_area', service_area)

    # Serialize before opening the file for writing: opening with 'w'
    # truncates it, so a failure inside yaml.dump would otherwise destroy
    # the original content.
    try:
        serialized = yaml.dump(
            new_data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
    except yaml.YAMLError as exc:
        return False, f"Cannot serialize YAML: {exc}"

    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except OSError as exc:
        return False, f"Cannot write file: {exc}"

    return True, f"Added service_area: {governorate}"
def main():
    """Command-line entry point.

    Processes the file given with ``--file``, or otherwise every
    ``PS-*.yaml`` file in ``data/custodian`` next to the script's parent
    directory. Prints one status line per file followed by a summary of
    enriched, skipped and errored files. ``--dry-run`` is passed through
    to ``enrich_file``.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Enrich Palestinian institutions with service_area')
    cli.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    cli.add_argument('--file', type=str, help='Process a single file')
    options = cli.parse_args()

    # Select the files to process.
    if options.file:
        targets = [Path(options.file)]
    else:
        custodian_dir = Path(__file__).parent.parent / 'data' / 'custodian'
        targets = sorted(custodian_dir.glob('PS-*.yaml'))

    print(f"Processing {len(targets)} Palestinian institution files...")
    if options.dry_run:
        print("(DRY RUN - no changes will be made)\n")

    counts = {'success': 0, 'skipped': 0, 'error': 0}
    for path in targets:
        ok, note = enrich_file(path, dry_run=options.dry_run)
        label = 'OK' if ok else 'SKIP'
        print(f" [{label}] {path.name}: {note}")

        # Failures are split by message text: known "nothing to do" messages
        # count as skipped, everything else as an error.
        if ok:
            bucket = 'success'
        elif 'Already' in note or 'Could not' in note:
            bucket = 'skipped'
        else:
            bucket = 'error'
        counts[bucket] += 1

    print("\nSummary:")
    print(f" Enriched: {counts['success']}")
    print(f" Skipped: {counts['skipped']}")
    print(f" Errors: {counts['error']}")


if __name__ == '__main__':
    main()