240 lines
7.5 KiB
Python
Executable file
240 lines
7.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Palestinian heritage institution files with service_area data.
|
|
|
|
Maps our region codes to GADM admin2 (governorate) names for werkgebied display.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
import yaml
|
|
|
|
# Lookup table: internal region code -> GADM admin2 (governorate) name.
# The codes are abbreviations derived from city/region names; the governorate
# names come from the boundaries API.
REGION_TO_GOVERNORATE = {
    # --- Gaza Strip (admin1: PSE.1_1) ---
    "GZ": "Gaza",                    # Gaza City area
    "GZA": "GazaashShamaliyah",      # North Gaza (Gaza ash-Shamaliyah)
    "KY": "KhanYunis",               # Khan Yunis
    "RAF": "Rafah",                  # Rafah
    "DEB": "DeirAl-Balah",           # Deir al-Balah (central Gaza)

    # --- West Bank (admin1: PSE.2_1) ---
    "BTH": "Bethlehem",              # Bethlehem
    "HBN": "Hebron",                 # Hebron
    "JEM": "Jerusalem",              # Jerusalem (East Jerusalem)
    "RBH": "RamallahandAl-Bireh",    # Ramallah and Al-Bireh
    "NBS": "Nablus",                 # Nablus
    "JEN": "Jenin",                  # Jenin
    "JER": "Jericho",                # Jericho
    "TKM": "Tulkarm",                # Tulkarm
    "QAL": "Qalqilya",               # Qalqilya
    "SAL": "Salfit",                 # Salfit
    "TUB": "Tubas",                  # Tubas
    # Generic "West Bank" code: no single governorate applies, so it
    # defaults to Ramallah.
    "WE": "RamallahandAl-Bireh",
}
|
|
|
|
# Lookup table: city name -> governorate name, used when the region code
# alone is ambiguous.
CITY_TO_GOVERNORATE = {
    # Gaza Strip
    "Gaza City": "Gaza",
    "Gaza": "Gaza",
    "Khan Yunis": "KhanYunis",
    "Rafah": "Rafah",
    "Deir al-Balah": "DeirAl-Balah",
    # Towns in North Gaza
    "Beit Hanoun": "GazaashShamaliyah",
    "Beit Lahia": "GazaashShamaliyah",
    "Jabalia": "GazaashShamaliyah",
    # West Bank
    "Bethlehem": "Bethlehem",
    "Hebron": "Hebron",
    "Jerusalem": "Jerusalem",
    "East Jerusalem": "Jerusalem",
    "Ramallah": "RamallahandAl-Bireh",
    "Al-Bireh": "RamallahandAl-Bireh",
    "Nablus": "Nablus",
    "Jenin": "Jenin",
    "Jericho": "Jericho",
    "Tulkarm": "Tulkarm",
    "Qalqilya": "Qalqilya",
    "Salfit": "Salfit",
    "Tubas": "Tubas",
}
|
|
|
|
# Governorate name -> admin1 code, used to group governorates by region.
GOVERNORATE_TO_ADMIN1 = {
    # Gaza Strip
    **dict.fromkeys(
        (
            "Gaza",
            "GazaashShamaliyah",
            "KhanYunis",
            "Rafah",
            "DeirAl-Balah",
        ),
        "PSE.1_1",
    ),
    # West Bank
    **dict.fromkeys(
        (
            "Bethlehem",
            "Hebron",
            "Jerusalem",
            "RamallahandAl-Bireh",
            "Nablus",
            "Jenin",
            "Jericho",
            "Tulkarm",
            "Qalqilya",
            "Salfit",
            "Tubas",
        ),
        "PSE.2_1",
    ),
}
|
|
|
|
# Governorate name -> database ID, as listed by the API endpoint
# /boundaries/countries/PS/admin2.
GOVERNORATE_TO_ID = {
    # Gaza Strip
    "DeirAl-Balah": 1,
    "Gaza": 2,
    "GazaashShamaliyah": 3,
    "KhanYunis": 4,
    "Rafah": 5,

    # West Bank
    "Bethlehem": 6,
    "Hebron": 7,
    "Jenin": 8,
    "Jericho": 9,
    "Jerusalem": 10,
    "Nablus": 11,
    "Qalqilya": 12,
    "RamallahandAl-Bireh": 13,
    "Salfit": 14,
    "Tubas": 15,
    "Tulkarm": 16,
}
|
|
|
|
|
|
def determine_governorate(entry: dict) -> str | None:
|
|
"""Determine the governorate name from entry data."""
|
|
# Try location_resolution first
|
|
loc_res = entry.get('ghcid', {}).get('location_resolution', {})
|
|
|
|
# 1. Try city_name → governorate mapping
|
|
city_name = loc_res.get('city_name')
|
|
if city_name and city_name in CITY_TO_GOVERNORATE:
|
|
return CITY_TO_GOVERNORATE[city_name]
|
|
|
|
# 2. Try region_code → governorate mapping
|
|
region_code = loc_res.get('region_code')
|
|
if region_code and region_code in REGION_TO_GOVERNORATE:
|
|
return REGION_TO_GOVERNORATE[region_code]
|
|
|
|
# 3. Try original_entry city
|
|
orig_city = entry.get('original_entry', {}).get('city')
|
|
if orig_city and orig_city in CITY_TO_GOVERNORATE:
|
|
return CITY_TO_GOVERNORATE[orig_city]
|
|
|
|
return None
|
|
|
|
|
|
def create_service_area(
    governorate: str,
    api_base_url: str = 'http://91.98.224.44:8001',
) -> dict:
    """Create a service_area dict for the given governorate.

    Args:
        governorate: Governorate name, looked up in ``GOVERNORATE_TO_ADMIN1``
            and ``GOVERNORATE_TO_ID``.
        api_base_url: Base URL of the boundaries API used to build the
            ``api_endpoint`` value. The default is the host this script has
            always used.

    Returns:
        A dict describing the admin2 service area. For a governorate missing
        from the lookup tables, ``admin1_code``, ``admin1_name``, ``admin2_id``
        and ``api_endpoint`` are ``None``, so an unknown name is neither
        labelled as 'West Bank' nor given a URL containing the text "None".
    """
    admin1_names = {'PSE.1_1': 'Gaza Strip', 'PSE.2_1': 'West Bank'}

    admin1_code = GOVERNORATE_TO_ADMIN1.get(governorate)
    admin1_name = admin1_names.get(admin1_code)
    admin2_id = GOVERNORATE_TO_ID.get(governorate)

    # Only build an endpoint when there is a real ID to put in it.
    if admin2_id is None:
        api_endpoint = None
    else:
        api_endpoint = f"{api_base_url.rstrip('/')}/boundaries/admin2/{admin2_id}/geojson"

    return {
        'type': 'admin2',
        'country_code': 'PS',
        'admin1_code': admin1_code,
        'admin1_name': admin1_name,
        'admin2_id': admin2_id,
        'admin2_name': governorate,
        'source': 'GADM v4.1',
        'api_endpoint': api_endpoint,
        # Timezone-aware UTC timestamp of when this record was generated.
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'notes': f'Heritage institution service area based on location in {governorate} governorate.'
    }
|
|
|
|
|
|
def enrich_file(filepath: Path, dry_run: bool = False) -> tuple[bool, str]:
    """Enrich a single institution file with service_area data.

    The YAML document at *filepath* is loaded, a ``service_area`` mapping is
    derived from its location data, and the document is written back with
    ``service_area`` placed directly after the ``ghcid`` key (or at the end
    when there is no ``ghcid`` key). The whole document is re-serialised with
    ``yaml.dump``.

    Args:
        filepath: Path to the YAML file to enrich.
        dry_run: When true, report what would be added without writing.

    Returns:
        ``(success, message)``. ``success`` is true when a service_area was
        added (or would be, in dry-run mode). Skip messages begin with
        "Already" or "Could not"; every other failure message is an error.
        Read, parse and write failures are reported through the return value
        rather than raised.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        # Collapse multi-line parser errors so the report stays on one line.
        detail = ' '.join(str(exc).split())
        return False, f"Failed to load YAML ({type(exc).__name__}): {detail}"

    if data is None:
        return False, "Empty or invalid YAML"

    # A list or scalar root cannot hold a service_area key.
    if not isinstance(data, dict):
        return False, f"Unexpected YAML structure: expected a mapping, got {type(data).__name__}"

    # Check if already has service_area
    if 'service_area' in data:
        return False, "Already has service_area"

    # Determine governorate
    governorate = determine_governorate(data)
    if not governorate:
        return False, "Could not determine governorate"

    if dry_run:
        return True, f"Would add service_area: {governorate}"

    service_area = create_service_area(governorate)

    # Rebuild the dict so service_area lands right after the ghcid section.
    enriched = {}
    for key, value in data.items():
        enriched[key] = value
        if key == 'ghcid':
            enriched['service_area'] = service_area
    # If ghcid wasn't found, add at end
    enriched.setdefault('service_area', service_area)

    try:
        # Serialise before opening the file: opening with 'w' truncates it,
        # so a failure during dumping must not destroy the existing content.
        text = yaml.dump(enriched, default_flow_style=False, allow_unicode=True, sort_keys=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(text)
    except (OSError, yaml.YAMLError) as exc:
        detail = ' '.join(str(exc).split())
        return False, f"Failed to write YAML ({type(exc).__name__}): {detail}"

    return True, f"Added service_area: {governorate}"
|
|
|
|
|
|
def main():
    """Command-line entry point: enrich one file or every ``PS-*.yaml`` file.

    Without ``--file``, all ``PS-*.yaml`` files in ``../data/custodian``
    (relative to this script's directory) are processed. One result line is
    printed per file, followed by a summary of enriched, skipped and failed
    files. A failure in one file is reported and counted as an error; it does
    not stop the remaining files from being processed.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Enrich Palestinian institutions with service_area')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes')
    parser.add_argument('--file', type=str, help='Process a single file')
    args = parser.parse_args()

    # Find data directory
    script_dir = Path(__file__).parent
    data_dir = script_dir.parent / 'data' / 'custodian'

    if args.file:
        files = [Path(args.file)]
    else:
        files = sorted(data_dir.glob('PS-*.yaml'))

    print(f"Processing {len(files)} Palestinian institution files...")
    if args.dry_run:
        print("(DRY RUN - no changes will be made)\n")

    stats = {'success': 0, 'skipped': 0, 'error': 0}

    for filepath in files:
        try:
            success, message = enrich_file(filepath, dry_run=args.dry_run)
        except Exception as exc:
            # Top-level boundary: report the failure and keep going so one
            # bad file does not abort the whole batch.
            success, message = False, f"Unexpected error ({type(exc).__name__}): {exc}"

        if success:
            outcome, status = 'success', 'OK'
        elif message.startswith(('Already', 'Could not')):
            # Anchored at the start so error details that happen to contain
            # these words are not mistaken for skips.
            outcome, status = 'skipped', 'SKIP'
        else:
            outcome, status = 'error', 'ERROR'

        stats[outcome] += 1
        print(f" [{status}] {filepath.name}: {message}")

    print("\nSummary:")
    print(f" Enriched: {stats['success']}")
    print(f" Skipped: {stats['skipped']}")
    print(f" Errors: {stats['error']}")
|
|
|
|
|
|
# Run the enrichment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
|