glam/scripts/generate_palestinian_ghcids.py
2025-12-06 19:50:04 +01:00

347 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Generate GHCID persistent identifiers for Palestinian and Lebanese heritage institutions.

This script reads the consolidated Palestinian heritage JSON and generates:
- GHCID string (human-readable)
- UUID v5 (primary persistent identifier)
- UUID v8 (SHA-256 based)
- Numeric hash (64-bit)

Usage:
    python scripts/generate_palestinian_ghcids.py [--dry-run]

Output:
    Updates data/extracted/palestinian_heritage_consolidated.json with ghcid fields
"""
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Tuple
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import (
GHCIDComponents,
GHCIDGenerator,
InstitutionType,
extract_abbreviation_from_name,
)
# City to region code mappings
# Palestine (PS) - ISO 3166-2:PS
# Keys are lower-case city names; values are the governorate codes.
# The table is written grouped by governorate so cities that share a code
# sit on one line; the resulting dict has one entry per city.
PALESTINE_CITY_REGIONS: Dict[str, str] = {
    city_name: governorate_code
    for governorate_code, city_names in (
        # Ramallah and Al-Bireh (Birzeit is part of the Ramallah governorate)
        ("RBH", ("ramallah", "al-bireh", "birzeit")),
        ("GZA", ("gaza",)),        # Gaza
        ("HBN", ("hebron",)),      # Hebron
        ("NBS", ("nablus",)),      # Nablus
        ("BTH", ("bethlehem",)),   # Bethlehem
        ("JEM", ("jerusalem",)),   # Jerusalem (special status)
        ("TKM", ("tulkarm",)),     # Tulkarm
        ("JEN", ("jenin",)),       # Jenin
        ("JRH", ("jericho",)),     # Jericho
        ("QQA", ("qalqilya",)),    # Qalqilya
        ("SLT", ("salfit",)),      # Salfit
        ("TBS", ("tubas",)),       # Tubas
        ("KYS", ("khan yunis",)),  # Khan Yunis
        ("RFH", ("rafah",)),       # Rafah
    )
    for city_name in city_names
}
# Lebanon (LB) - ISO 3166-2:LB
# Keys are lower-case city names; values are the governorate codes.
# Written grouped by governorate code; the dict has one entry per city.
LEBANON_CITY_REGIONS: Dict[str, str] = {
    city_name: governorate_code
    for governorate_code, city_names in (
        ("BA", ("beirut",)),        # Bayrut (Beirut)
        ("AS", ("tripoli",)),       # Ash Shimal (North)
        ("JA", ("sidon", "tyre")),  # Al Janub (South)
        ("NA", ("nabatieh",)),      # An Nabatiyah
        ("BI", ("zahle",)),         # Al Biqa (Bekaa)
        ("BH", ("baalbek",)),       # Baalbek-Hermel
    )
    for city_name in city_names
}
# US States
# Keys are lower-case city names (both spellings of Washington DC are
# accepted); values are the two-letter state/district codes.
US_STATE_REGIONS: Dict[str, str] = {
    city_name: state_code
    for state_code, city_names in (
        ("DC", ("washington dc", "washington, dc")),
        ("NY", ("new york",)),
        ("CA", ("los angeles", "san francisco")),
        ("IL", ("chicago",)),
        ("MA", ("boston",)),
    )
    for city_name in city_names
}
# Institution type mapping from GLAM-NER types
# Keys are GLAM-NER type/subtype labels exactly as they appear in the data;
# values are the GHCID InstitutionType each label maps to. The table is
# written grouped by target type; the dict has one entry per label.
TYPE_MAPPING: Dict[str, InstitutionType] = {
    label: institution_type
    for institution_type, labels in (
        # Museums
        (
            InstitutionType.MUSEUM,
            ("museum", "art_museum", "archaeology_museum", "GRP.HER.MUS"),
        ),
        # Libraries
        (
            InstitutionType.LIBRARY,
            (
                "library",
                "national_library",
                "public_library",
                "public_library_system",
                "academic_library",
                "GRP.HER.LIB",
            ),
        ),
        # Archives
        (
            InstitutionType.ARCHIVE,
            (
                "archive",
                "oral_history_archive",
                "photographic_archive",
                "research_archive",
                "municipal_archive",
                "institutional_archive",
                "refugee_archive",
                "cultural_archive",
                "community_archive",
                "digital_archive",
                "family_archives",
                "GRP.HER",  # Default heritage to archive
            ),
        ),
        # Galleries / Cultural Centers
        (
            InstitutionType.GALLERY,
            ("gallery", "cultural_center", "theater", "GRP.HER.GAL"),
        ),
        # Research Centers
        (
            InstitutionType.RESEARCH_CENTER,
            ("research_institute", "heritage_center"),
        ),
        # Societies/Networks
        (
            InstitutionType.COLLECTING_SOCIETY,
            ("archival_network", "genealogy_project"),
        ),
        # Digital platforms
        (
            InstitutionType.OFFICIAL_INSTITUTION,
            ("digital_platform", "digital_encyclopedia", "GRP.HER.DIG"),
        ),
    )
    for label in labels
}
def get_city_code(city: str) -> str:
    """Generate a 3-letter city code from a city name.

    The name is lower-cased and stripped, then looked up in a table of
    hand-assigned codes. Names not in the table fall back to the first three
    alphabetic characters, upper-cased and right-padded with ``X``.

    Args:
        city: City name; may be empty or ``None``.

    Returns:
        The city code, or ``"XXX"`` when ``city`` is empty/``None`` or
        contains no alphabetic characters.
    """
    if not city:
        return "XXX"

    # Normalize
    city_clean = city.lower().strip()

    # Special cases
    special_codes = {
        "jerusalem": "JER",
        "ramallah": "RAM",
        "bethlehem": "BTH",
        "gaza": "GAZ",
        "hebron": "HEB",
        "nablus": "NAB",
        "beirut": "BEI",
        "washington dc": "WDC",
        # US_STATE_REGIONS accepts both spellings of Washington DC; without
        # this alias the comma form fell through to the default and produced
        # "WAS", giving the same city two different codes.
        "washington, dc": "WDC",
        "al-bireh": "BIR",
        "birzeit": "BIR",
        "tulkarm": "TUL",
    }
    if city_clean in special_codes:
        return special_codes[city_clean]

    # Default: first 3 letters (digits, spaces and punctuation are ignored);
    # ljust pads names with fewer than three letters.
    city_alpha = ''.join(c for c in city_clean if c.isalpha())
    return city_alpha[:3].upper().ljust(3, 'X')
def get_region_code(city: str, country: str) -> str:
    """Look up the region code for a city within a country.

    The city is matched case-insensitively (after stripping whitespace)
    against the table for the given country code, which is compared in upper
    case. Cities missing from a known country's table get that country's
    fallback: ``"WB"`` for PS, ``"BA"`` for LB and ``"DC"`` for US.

    Returns:
        The region code, or ``"XX"`` when ``city`` is empty/``None`` or the
        country is not one of PS, LB or US.
    """
    if not city:
        return "XX"

    city_key = city.lower().strip()
    country_key = (country or "").upper()

    # country code -> (city table, fallback region when the city is unlisted)
    tables_by_country = {
        "PS": (PALESTINE_CITY_REGIONS, "WB"),  # Default to West Bank
        "LB": (LEBANON_CITY_REGIONS, "BA"),  # Default to Beirut
        "US": (US_STATE_REGIONS, "DC"),
    }
    match = tables_by_country.get(country_key)
    if match is None:
        return "XX"

    city_table, fallback_region = match
    return city_table.get(city_key, fallback_region)
def get_institution_type(type_str: str, subtype: str) -> InstitutionType:
    """Map a GLAM-NER type/subtype pair to a GHCID InstitutionType.

    The subtype is consulted first because it is the more specific label;
    the main type is used only when the subtype is empty or unmapped.

    Returns:
        The mapped type from ``TYPE_MAPPING``, or ``InstitutionType.ARCHIVE``
        when neither label is mapped.
    """
    # Most specific label first.
    for label in (subtype, type_str):
        if label and label in TYPE_MAPPING:
            return TYPE_MAPPING[label]

    # Default
    return InstitutionType.ARCHIVE
def generate_ghcid_for_institution(inst: Dict) -> Optional[Dict]:
    """Generate GHCID data for a single institution record.

    Args:
        inst: Institution record. The keys read are ``name``, ``country``,
            ``city``, ``location``, ``type``, ``subtype`` and ``wikidata``
            (a dict whose ``id`` entry is used when present).

    Returns:
        ``None`` when the record has no name. Otherwise one of:

        * ``{"ghcid_status": "skipped", "ghcid_reason": ...}`` when
          ``location`` is ``"Online"`` and no city is given;
        * ``{"ghcid_status": "error", "ghcid_error": ...}`` when the
          components fail validation or building them raises;
        * a dict with ``ghcid``, ``ghcid_uuid``, ``ghcid_uuid_sha256``,
          ``ghcid_numeric``, ``ghcid_components`` and ``ghcid_generated``
          (UTC ISO-8601 timestamp) on success.
    """
    name = inst.get("name", "")
    if not name:
        return None

    city = inst.get("city", "")
    location = inst.get("location", "")
    type_str = inst.get("type", "")
    subtype = inst.get("subtype", "")
    wikidata = inst.get("wikidata", {})
    wikidata_id = wikidata.get("id") if isinstance(wikidata, dict) else None

    # Skip online-only platforms (no physical location)
    if location == "Online" and not city:
        return {
            "ghcid_status": "skipped",
            "ghcid_reason": "Online-only platform without physical location"
        }

    # Normalise the country code once, before anything consumes it.
    # get_region_code() upper-cases its argument for the table lookup, so a
    # record carrying e.g. "ps" used to get a Palestinian region code while
    # the raw lower-case value was written into the identifier. A missing,
    # null or blank country defaults to Palestine.
    country_code = (inst.get("country") or "").strip().upper() or "PS"

    # Determine components
    region_code = get_region_code(city, country_code)
    city_code = get_city_code(city)
    inst_type = get_institution_type(type_str, subtype)

    # Extract abbreviation from name; fall back to a placeholder when the
    # helper yields nothing.
    abbreviation = extract_abbreviation_from_name(name)
    if not abbreviation:
        abbreviation = "INST"

    try:
        # Create GHCID components
        components = GHCIDComponents(
            country_code=country_code,
            region_code=region_code,
            city_locode=city_code,
            institution_type=inst_type.value,
            abbreviation=abbreviation,
            # The "Q" of the Wikidata id is removed before it is passed on.
            wikidata_qid=wikidata_id.replace("Q", "") if wikidata_id else None
        )

        # Validate
        is_valid, error = components.validate()
        if not is_valid:
            return {
                "ghcid_status": "error",
                "ghcid_error": error
            }

        # Generate all identifier formats
        return {
            "ghcid": components.to_string(),
            "ghcid_uuid": str(components.to_uuid()),
            "ghcid_uuid_sha256": str(components.to_uuid_sha256()),
            "ghcid_numeric": components.to_numeric(),
            "ghcid_components": {
                "country": country_code,
                "region": region_code,
                "city": city_code,
                "type": inst_type.value,
                "abbreviation": abbreviation,
            },
            "ghcid_generated": datetime.now(timezone.utc).isoformat(),
        }
    except Exception as e:
        # Per-record boundary: one bad record is reported as an error result
        # instead of aborting the whole batch.
        return {
            "ghcid_status": "error",
            "ghcid_error": str(e)
        }
def main():
    """Generate GHCIDs for every institution in the consolidated JSON file.

    Loads ``data/extracted/palestinian_heritage_consolidated.json`` (relative
    to the parent of this script's directory), annotates each institution
    in place with either its GHCID fields or a ``ghcid_status`` of
    ``"skipped"``/``"error"``, and prints a summary.

    The file is rewritten only when ``--dry-run`` is not given and at least
    one GHCID was generated.

    Returns:
        ``1`` when the data file does not exist, ``0`` otherwise.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Generate GHCIDs for Palestinian heritage institutions")
    parser.add_argument("--dry-run", action="store_true", help="Don't save changes")
    args = parser.parse_args()

    # Load data
    data_file = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"
    if not data_file.exists():
        print(f"Error: Data file not found: {data_file}")
        return 1

    print(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data.get("institutions", [])
    print(f"Processing {len(institutions)} institutions...")

    stats = {
        "total": len(institutions),
        "generated": 0,
        "skipped": 0,
        "errors": 0,
    }

    for index, inst in enumerate(institutions):
        ghcid_data = generate_ghcid_for_institution(inst)

        if ghcid_data is None:
            # No name to report, so identify the record by its position
            # instead of counting the error silently.
            stats["errors"] += 1
            print(f" Error: institution #{index} has no name; no GHCID generated")
            continue

        status = ghcid_data.get("ghcid_status")

        if status == "skipped":
            stats["skipped"] += 1
            inst["ghcid_status"] = "skipped"
            inst["ghcid_reason"] = ghcid_data.get("ghcid_reason")
            print(f" Skipped: {inst.get('name')} - {ghcid_data.get('ghcid_reason')}")
            continue

        if status == "error":
            stats["errors"] += 1
            inst["ghcid_status"] = "error"
            inst["ghcid_error"] = ghcid_data.get("ghcid_error")
            print(f" Error: {inst.get('name')} - {ghcid_data.get('ghcid_error')}")
            continue

        # Success. The file is updated in place across runs, so drop any
        # skip/error markers left by an earlier run; otherwise the record
        # would carry a GHCID and a stale failure status at the same time.
        for stale_key in ("ghcid_status", "ghcid_reason", "ghcid_error"):
            inst.pop(stale_key, None)

        # Add GHCID data to institution
        inst.update(ghcid_data)
        stats["generated"] += 1
        print(f" {inst.get('name')}: {ghcid_data.get('ghcid')}")

    # Update metadata
    if not args.dry_run and stats["generated"] > 0:
        # setdefault: an input without a "metadata" or "statistics" section
        # must not raise KeyError after all the work has been done.
        metadata = data.setdefault("metadata", {})
        metadata["updated"] = datetime.now(timezone.utc).isoformat()
        metadata["version"] = "2.2.0"
        metadata.setdefault("statistics", {})["ghcid_generated"] = stats["generated"]

        # Save
        with open(data_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"\nSaved: {data_file}")

    # Summary
    print("\n" + "=" * 60)
    print("GHCID GENERATION COMPLETE")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"GHCIDs generated: {stats['generated']}")
    print(f"Skipped (online): {stats['skipped']}")
    print(f"Errors: {stats['errors']}")

    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())