#!/usr/bin/env python3
"""
Generate GHCID persistent identifiers for Palestinian and Lebanese heritage institutions.

This script reads the consolidated Palestinian heritage JSON and generates:
- GHCID string (human-readable)
- UUID v5 (primary persistent identifier)
- UUID v8 (SHA-256 based)
- Numeric hash (64-bit)

Usage:
    python scripts/generate_palestinian_ghcids.py [--dry-run]

Output:
    Updates data/extracted/palestinian_heritage_consolidated.json with ghcid fields
"""

import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional, Tuple

# Add project root to path so glam_extractor resolves when run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import (
    GHCIDComponents,
    GHCIDGenerator,
    InstitutionType,
    extract_abbreviation_from_name,
)

# City to region code mappings
# Palestine (PS) - ISO 3166-2:PS
PALESTINE_CITY_REGIONS = {
    "ramallah": "RBH",      # Ramallah and Al-Bireh
    "al-bireh": "RBH",      # Ramallah and Al-Bireh
    "birzeit": "RBH",       # Part of Ramallah governorate
    "gaza": "GZA",          # Gaza
    "hebron": "HBN",        # Hebron
    "nablus": "NBS",        # Nablus
    "bethlehem": "BTH",     # Bethlehem
    "jerusalem": "JEM",     # Jerusalem (special status)
    "tulkarm": "TKM",       # Tulkarm
    "jenin": "JEN",         # Jenin
    "jericho": "JRH",       # Jericho
    "qalqilya": "QQA",      # Qalqilya
    "salfit": "SLT",        # Salfit
    "tubas": "TBS",         # Tubas
    "khan yunis": "KYS",    # Khan Yunis
    "rafah": "RFH",         # Rafah
}

# Lebanon (LB) - ISO 3166-2:LB
LEBANON_CITY_REGIONS = {
    "beirut": "BA",         # Bayrut (Beirut)
    "tripoli": "AS",        # Ash Shimal (North)
    "sidon": "JA",          # Al Janub (South)
    "tyre": "JA",           # Al Janub (South)
    "nabatieh": "NA",       # An Nabatiyah
    "zahle": "BI",          # Al Biqa (Bekaa)
    "baalbek": "BH",        # Baalbek-Hermel
}

# US States
US_STATE_REGIONS = {
    "washington dc": "DC",
    "washington, dc": "DC",
    "new york": "NY",
    "los angeles": "CA",
    "san francisco": "CA",
    "chicago": "IL",
    "boston": "MA",
}

# Institution type mapping from GLAM-NER types
TYPE_MAPPING = {
    # Museums
    "museum": InstitutionType.MUSEUM,
    "art_museum": InstitutionType.MUSEUM,
    "archaeology_museum": InstitutionType.MUSEUM,
    "GRP.HER.MUS": InstitutionType.MUSEUM,
    # Libraries
    "library": InstitutionType.LIBRARY,
    "national_library": InstitutionType.LIBRARY,
    "public_library": InstitutionType.LIBRARY,
    "public_library_system": InstitutionType.LIBRARY,
    "academic_library": InstitutionType.LIBRARY,
    "GRP.HER.LIB": InstitutionType.LIBRARY,
    # Archives
    "archive": InstitutionType.ARCHIVE,
    "oral_history_archive": InstitutionType.ARCHIVE,
    "photographic_archive": InstitutionType.ARCHIVE,
    "research_archive": InstitutionType.ARCHIVE,
    "municipal_archive": InstitutionType.ARCHIVE,
    "institutional_archive": InstitutionType.ARCHIVE,
    "refugee_archive": InstitutionType.ARCHIVE,
    "cultural_archive": InstitutionType.ARCHIVE,
    "community_archive": InstitutionType.ARCHIVE,
    "digital_archive": InstitutionType.ARCHIVE,
    "family_archives": InstitutionType.ARCHIVE,
    "GRP.HER": InstitutionType.ARCHIVE,  # Default heritage to archive
    # Galleries / Cultural Centers
    "gallery": InstitutionType.GALLERY,
    "cultural_center": InstitutionType.GALLERY,
    "theater": InstitutionType.GALLERY,
    "GRP.HER.GAL": InstitutionType.GALLERY,
    # Research Centers
    "research_institute": InstitutionType.RESEARCH_CENTER,
    "heritage_center": InstitutionType.RESEARCH_CENTER,
    # Societies/Networks
    "archival_network": InstitutionType.COLLECTING_SOCIETY,
    "genealogy_project": InstitutionType.COLLECTING_SOCIETY,
    # Digital platforms
    "digital_platform": InstitutionType.OFFICIAL_INSTITUTION,
    "digital_encyclopedia": InstitutionType.OFFICIAL_INSTITUTION,
    "GRP.HER.DIG": InstitutionType.OFFICIAL_INSTITUTION,
}


def get_city_code(city: str) -> str:
    """Generate 3-letter city code from city name.

    Known cities get curated codes; otherwise the first three alphabetic
    characters are used, right-padded with 'X' when the name is shorter.
    Returns "XXX" for an empty/missing city.
    """
    if not city:
        return "XXX"

    # Normalize
    city_clean = city.lower().strip()

    # Special cases
    special_codes = {
        "jerusalem": "JER",
        "ramallah": "RAM",
        "bethlehem": "BTH",
        "gaza": "GAZ",
        "hebron": "HEB",
        "nablus": "NAB",
        "beirut": "BEI",
        "washington dc": "WDC",
        "al-bireh": "BIR",
        "birzeit": "BIR",
        "tulkarm": "TUL",
    }
    if city_clean in special_codes:
        return special_codes[city_clean]

    # Default: first 3 letters
    city_alpha = ''.join(c for c in city_clean if c.isalpha())
    return city_alpha[:3].upper() if len(city_alpha) >= 3 else city_alpha.upper().ljust(3, 'X')


def get_region_code(city: str, country: str) -> str:
    """Get ISO 3166-2 region code for a city.

    Falls back to a per-country default region when the city is unknown,
    and to "XX" for unknown/missing countries or an empty city.
    """
    if not city:
        return "XX"

    city_lower = city.lower().strip()
    country_upper = (country or "").upper()

    if country_upper == "PS":
        return PALESTINE_CITY_REGIONS.get(city_lower, "WB")  # Default to West Bank
    elif country_upper == "LB":
        return LEBANON_CITY_REGIONS.get(city_lower, "BA")  # Default to Beirut
    elif country_upper == "US":
        return US_STATE_REGIONS.get(city_lower, "DC")
    else:
        return "XX"


def get_institution_type(type_str: str, subtype: str) -> InstitutionType:
    """Map GLAM-NER type to GHCID InstitutionType.

    The subtype is consulted first because it is more specific; unmapped
    values default to ARCHIVE.
    """
    # Try subtype first (more specific)
    if subtype and subtype in TYPE_MAPPING:
        return TYPE_MAPPING[subtype]

    # Then try main type
    if type_str and type_str in TYPE_MAPPING:
        return TYPE_MAPPING[type_str]

    # Default
    return InstitutionType.ARCHIVE


def generate_ghcid_for_institution(inst: Dict) -> Optional[Dict]:
    """Generate GHCID data for a single institution.

    Returns None when the record has no name, a status dict
    ({"ghcid_status": "skipped"/"error", ...}) for records that cannot get
    an identifier, or a dict of ghcid_* fields on success.
    """
    name = inst.get("name", "")
    if not name:
        return None

    country = inst.get("country", "PS")
    city = inst.get("city", "")
    location = inst.get("location", "")
    type_str = inst.get("type", "")
    subtype = inst.get("subtype", "")
    wikidata = inst.get("wikidata", {})
    wikidata_id = wikidata.get("id") if isinstance(wikidata, dict) else None

    # Skip online-only platforms (no physical location)
    if location == "Online" and not city:
        return {
            "ghcid_status": "skipped",
            "ghcid_reason": "Online-only platform without physical location"
        }

    # Determine components
    country_code = country if country else "PS"  # guard against empty-string country
    region_code = get_region_code(city, country_code)
    city_code = get_city_code(city)
    inst_type = get_institution_type(type_str, subtype)

    # Extract abbreviation from name
    abbreviation = extract_abbreviation_from_name(name)

    # Handle edge case: empty abbreviation
    if not abbreviation:
        abbreviation = "INST"

    # FIX: strip only the leading "Q" of the Wikidata QID. The previous
    # replace("Q", "") removed every "Q" in the string, which would silently
    # corrupt a malformed ID containing more than one "Q".
    if wikidata_id and wikidata_id.startswith("Q"):
        wikidata_qid = wikidata_id[1:]
    else:
        wikidata_qid = wikidata_id if wikidata_id else None

    try:
        # Create GHCID components
        components = GHCIDComponents(
            country_code=country_code,
            region_code=region_code,
            city_locode=city_code,
            institution_type=inst_type.value,
            abbreviation=abbreviation,
            wikidata_qid=wikidata_qid
        )

        # Validate
        is_valid, error = components.validate()
        if not is_valid:
            return {
                "ghcid_status": "error",
                "ghcid_error": error
            }

        # Generate all identifier formats
        return {
            "ghcid": components.to_string(),
            "ghcid_uuid": str(components.to_uuid()),
            "ghcid_uuid_sha256": str(components.to_uuid_sha256()),
            "ghcid_numeric": components.to_numeric(),
            "ghcid_components": {
                "country": country_code,
                "region": region_code,
                "city": city_code,
                "type": inst_type.value,
                "abbreviation": abbreviation,
            },
            "ghcid_generated": datetime.now(timezone.utc).isoformat(),
        }
    except Exception as e:
        # Broad catch is deliberate: one bad record must not abort the batch;
        # the failure is recorded on the record itself.
        return {
            "ghcid_status": "error",
            "ghcid_error": str(e)
        }


def main():
    """Load the consolidated JSON, generate GHCIDs, and (unless --dry-run) save.

    Returns a process exit code: 0 on success, 1 when the data file is missing.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Generate GHCIDs for Palestinian heritage institutions")
    parser.add_argument("--dry-run", action="store_true", help="Don't save changes")
    args = parser.parse_args()

    # Load data
    data_file = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"
    if not data_file.exists():
        print(f"Error: Data file not found: {data_file}")
        return 1

    print(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data.get("institutions", [])
    print(f"Processing {len(institutions)} institutions...")

    stats = {
        "total": len(institutions),
        "generated": 0,
        "skipped": 0,
        "errors": 0,
    }

    for inst in institutions:
        ghcid_data = generate_ghcid_for_institution(inst)

        if ghcid_data is None:
            # FIX: record the failure on the institution, consistent with the
            # other error path below (previously only the counter was bumped).
            stats["errors"] += 1
            inst["ghcid_status"] = "error"
            inst["ghcid_error"] = "Missing institution name"
            continue

        if ghcid_data.get("ghcid_status") == "skipped":
            stats["skipped"] += 1
            inst["ghcid_status"] = "skipped"
            inst["ghcid_reason"] = ghcid_data.get("ghcid_reason")
            print(f"  Skipped: {inst.get('name')} - {ghcid_data.get('ghcid_reason')}")
            continue

        if ghcid_data.get("ghcid_status") == "error":
            stats["errors"] += 1
            inst["ghcid_status"] = "error"
            inst["ghcid_error"] = ghcid_data.get("ghcid_error")
            print(f"  Error: {inst.get('name')} - {ghcid_data.get('ghcid_error')}")
            continue

        # Add GHCID data to institution
        inst.update(ghcid_data)
        stats["generated"] += 1
        print(f"  {inst.get('name')}: {ghcid_data.get('ghcid')}")

    # Update metadata
    if not args.dry_run and stats["generated"] > 0:
        data["metadata"]["updated"] = datetime.now(timezone.utc).isoformat()
        data["metadata"]["version"] = "2.2.0"
        # FIX: tolerate a metadata block without a "statistics" key
        # (previously a plain subscript that raised KeyError).
        data["metadata"].setdefault("statistics", {})["ghcid_generated"] = stats["generated"]

        # Save
        with open(data_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"\nSaved: {data_file}")

    # Summary
    print("\n" + "=" * 60)
    print("GHCID GENERATION COMPLETE")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"GHCIDs generated: {stats['generated']}")
    print(f"Skipped (online): {stats['skipped']}")
    print(f"Errors: {stats['errors']}")

    return 0


if __name__ == "__main__":
    sys.exit(main())