347 lines
11 KiB
Python
347 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate GHCID persistent identifiers for Palestinian and Lebanese heritage institutions.
|
|
|
|
This script reads the consolidated Palestinian heritage JSON and generates:
|
|
- GHCID string (human-readable)
|
|
- UUID v5 (primary persistent identifier)
|
|
- UUID v8 (SHA-256 based)
|
|
- Numeric hash (64-bit)
|
|
|
|
Usage:
|
|
python scripts/generate_palestinian_ghcids.py [--dry-run]
|
|
|
|
Output:
|
|
Updates data/extracted/palestinian_heritage_consolidated.json with ghcid fields
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, Optional, Tuple
|
|
|
|
# Add the project's src/ directory to sys.path so glam_extractor can be imported
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.identifiers.ghcid import (
|
|
GHCIDComponents,
|
|
GHCIDGenerator,
|
|
InstitutionType,
|
|
extract_abbreviation_from_name,
|
|
)
|
|
|
|
|
|
# City to region code mappings (keys are lower-case city names).
# Palestine (PS) - ISO 3166-2:PS
# The table is written region by region; every city in a region's tuple is
# mapped to that region's code.
PALESTINE_CITY_REGIONS = {
    city_name: region
    for region, city_names in (
        # Ramallah and Al-Bireh (Birzeit is part of the Ramallah governorate)
        ("RBH", ("ramallah", "al-bireh", "birzeit")),
        ("GZA", ("gaza",)),  # Gaza
        ("HBN", ("hebron",)),  # Hebron
        ("NBS", ("nablus",)),  # Nablus
        ("BTH", ("bethlehem",)),  # Bethlehem
        ("JEM", ("jerusalem",)),  # Jerusalem (special status)
        ("TKM", ("tulkarm",)),  # Tulkarm
        ("JEN", ("jenin",)),  # Jenin
        ("JRH", ("jericho",)),  # Jericho
        ("QQA", ("qalqilya",)),  # Qalqilya
        ("SLT", ("salfit",)),  # Salfit
        ("TBS", ("tubas",)),  # Tubas
        ("KYS", ("khan yunis",)),  # Khan Yunis
        ("RFH", ("rafah",)),  # Rafah
    )
    for city_name in city_names
}
|
|
|
|
# Lebanon (LB) - ISO 3166-2:LB
# Lower-case city names mapped to governorate codes, written region by region.
LEBANON_CITY_REGIONS = {
    city_name: region
    for region, city_names in (
        ("BA", ("beirut",)),  # Bayrut (Beirut)
        ("AS", ("tripoli",)),  # Ash Shimal (North)
        ("JA", ("sidon", "tyre")),  # Al Janub (South)
        ("NA", ("nabatieh",)),  # An Nabatiyah
        ("BI", ("zahle",)),  # Al Biqa (Bekaa)
        ("BH", ("baalbek",)),  # Baalbek-Hermel
    )
    for city_name in city_names
}
|
|
|
|
# US States
# Keys are lower-case city names (both spellings of Washington DC are
# accepted); values are the two-letter code of the state or district.
US_STATE_REGIONS = {
    city_name: state
    for state, city_names in (
        ("DC", ("washington dc", "washington, dc")),
        ("NY", ("new york",)),
        ("CA", ("los angeles", "san francisco")),
        ("IL", ("chicago",)),
        ("MA", ("boston",)),
    )
    for city_name in city_names
}
|
|
|
|
# Institution type mapping from GLAM-NER types
# Written as (target type, source labels) groups; every label in a group is
# mapped to that group's InstitutionType member.
TYPE_MAPPING = {
    label: institution_type
    for institution_type, labels in (
        # Museums
        (
            InstitutionType.MUSEUM,
            ("museum", "art_museum", "archaeology_museum", "GRP.HER.MUS"),
        ),
        # Libraries
        (
            InstitutionType.LIBRARY,
            (
                "library",
                "national_library",
                "public_library",
                "public_library_system",
                "academic_library",
                "GRP.HER.LIB",
            ),
        ),
        # Archives ("GRP.HER", the generic heritage label, defaults to archive)
        (
            InstitutionType.ARCHIVE,
            (
                "archive",
                "oral_history_archive",
                "photographic_archive",
                "research_archive",
                "municipal_archive",
                "institutional_archive",
                "refugee_archive",
                "cultural_archive",
                "community_archive",
                "digital_archive",
                "family_archives",
                "GRP.HER",
            ),
        ),
        # Galleries / Cultural Centers
        (
            InstitutionType.GALLERY,
            ("gallery", "cultural_center", "theater", "GRP.HER.GAL"),
        ),
        # Research Centers
        (
            InstitutionType.RESEARCH_CENTER,
            ("research_institute", "heritage_center"),
        ),
        # Societies/Networks
        (
            InstitutionType.COLLECTING_SOCIETY,
            ("archival_network", "genealogy_project"),
        ),
        # Digital platforms
        (
            InstitutionType.OFFICIAL_INSTITUTION,
            ("digital_platform", "digital_encyclopedia", "GRP.HER.DIG"),
        ),
    )
    for label in labels
}
|
|
|
|
|
|
def get_city_code(city: str) -> str:
    """Generate a 3-letter city code from a city name.

    A fixed code is used for the cities listed in ``special_codes``. Any
    other name is reduced to its alphabetic characters, upper-cased, and cut
    (or right-padded with ``X``) to exactly three characters.

    Args:
        city: City name in any letter case; surrounding whitespace is
            ignored. May be empty or ``None``.

    Returns:
        A 3-character code; ``"XXX"`` when no city is given or the name
        contains no letters.
    """
    if not city:
        return "XXX"

    # Normalize
    city_clean = city.lower().strip()

    # Special cases
    special_codes = {
        "jerusalem": "JER",
        "ramallah": "RAM",
        "bethlehem": "BTH",
        "gaza": "GAZ",
        "hebron": "HEB",
        "nablus": "NAB",
        "beirut": "BEI",
        "washington dc": "WDC",
        # Same city as above; US_STATE_REGIONS accepts this spelling too, so
        # both spellings must produce the same city code.
        "washington, dc": "WDC",
        "al-bireh": "BIR",
        "birzeit": "BIR",
        "tulkarm": "TUL",
    }

    code = special_codes.get(city_clean)
    if code is not None:
        return code

    # Default: first 3 letters. Upper-case BEFORE slicing because upper-casing
    # can lengthen a string (e.g. "ß" -> "SS"), which would otherwise yield a
    # code longer than three characters.
    city_alpha = "".join(c for c in city_clean if c.isalpha()).upper()
    return city_alpha[:3].ljust(3, "X")
|
|
|
|
|
|
def get_region_code(city: str, country: str) -> str:
    """Look up the region code for a city within a country.

    Args:
        city: City name; matched case-insensitively after trimming
            surrounding whitespace.
        country: Two-letter country code, matched case-insensitively. Only
            ``PS``, ``LB`` and ``US`` have lookup tables.

    Returns:
        The region code from the country's table. Cities missing from the
        table get the country default (``"WB"`` for PS, ``"BA"`` for LB,
        ``"DC"`` for US). ``"XX"`` is returned when the city is empty or the
        country has no table.
    """
    if not city:
        return "XX"

    city_key = city.lower().strip()
    iso_country = (country or "").upper()

    if iso_country not in ("PS", "LB", "US"):
        return "XX"

    # Choose the lookup table and its fallback, then do a single lookup.
    if iso_country == "PS":
        table, fallback = PALESTINE_CITY_REGIONS, "WB"  # Default to West Bank
    elif iso_country == "LB":
        table, fallback = LEBANON_CITY_REGIONS, "BA"  # Default to Beirut
    else:
        table, fallback = US_STATE_REGIONS, "DC"

    return table.get(city_key, fallback)
|
|
|
|
|
|
def get_institution_type(type_str: str, subtype: str) -> InstitutionType:
    """Map a GLAM-NER type/subtype pair to a GHCID InstitutionType.

    The subtype is consulted first because it is the more specific label;
    the main type is used only when the subtype is empty or unmapped.

    Args:
        type_str: Main GLAM-NER type label (may be empty).
        subtype: GLAM-NER subtype label (may be empty).

    Returns:
        The mapped ``InstitutionType``, or ``InstitutionType.ARCHIVE`` when
        neither label appears in ``TYPE_MAPPING``.
    """
    for label in (subtype, type_str):
        if label and label in TYPE_MAPPING:
            return TYPE_MAPPING[label]

    # Default
    return InstitutionType.ARCHIVE
|
|
|
|
|
|
def generate_ghcid_for_institution(inst: Dict) -> Optional[Dict]:
    """Generate GHCID data for a single institution.

    Args:
        inst: Institution record. The keys read are ``name``, ``country``
            (defaults to ``"PS"``), ``city``, ``location``, ``type``,
            ``subtype`` and ``wikidata`` (its ``id`` entry is used when
            ``wikidata`` is a dict).

    Returns:
        ``None`` when the record has no name. Otherwise one of:

        * ``{"ghcid_status": "skipped", "ghcid_reason": ...}`` when
          ``location`` is ``"Online"`` and there is no city;
        * ``{"ghcid_status": "error", "ghcid_error": ...}`` when component
          validation fails or building the identifiers raises;
        * a dict with ``ghcid``, ``ghcid_uuid``, ``ghcid_uuid_sha256``,
          ``ghcid_numeric``, ``ghcid_components`` and ``ghcid_generated``
          (current UTC time in ISO format).
    """
    name = inst.get("name", "")
    if not name:
        return None

    country = inst.get("country", "PS")
    city = inst.get("city", "")
    location = inst.get("location", "")
    type_str = inst.get("type", "")
    subtype = inst.get("subtype", "")
    wikidata = inst.get("wikidata", {})
    wikidata_id = wikidata.get("id") if isinstance(wikidata, dict) else None

    # Skip online-only platforms (no physical location)
    if location == "Online" and not city:
        return {
            "ghcid_status": "skipped",
            "ghcid_reason": "Online-only platform without physical location",
        }

    # Determine components.
    # The country code is normalised once here so that the value stored in
    # the identifier matches the one get_region_code compares against (that
    # function upper-cases internally); blank values fall back to "PS".
    country_code = (country or "PS").strip().upper() or "PS"
    region_code = get_region_code(city, country_code)
    city_code = get_city_code(city)
    inst_type = get_institution_type(type_str, subtype)

    # Extract abbreviation from name
    abbreviation = extract_abbreviation_from_name(name)

    # Handle edge case: empty abbreviation
    if not abbreviation:
        abbreviation = "INST"

    try:
        # Create GHCID components
        components = GHCIDComponents(
            country_code=country_code,
            region_code=region_code,
            city_locode=city_code,
            institution_type=inst_type.value,
            abbreviation=abbreviation,
            # The Wikidata id is passed without its "Q" prefix.
            wikidata_qid=wikidata_id.replace("Q", "") if wikidata_id else None,
        )

        # Validate
        is_valid, error = components.validate()
        if not is_valid:
            return {
                "ghcid_status": "error",
                "ghcid_error": error,
            }

        # Generate all identifier formats
        return {
            "ghcid": components.to_string(),
            "ghcid_uuid": str(components.to_uuid()),
            "ghcid_uuid_sha256": str(components.to_uuid_sha256()),
            "ghcid_numeric": components.to_numeric(),
            "ghcid_components": {
                "country": country_code,
                "region": region_code,
                "city": city_code,
                "type": inst_type.value,
                "abbreviation": abbreviation,
            },
            "ghcid_generated": datetime.now(timezone.utc).isoformat(),
        }

    except Exception as e:
        # Deliberate per-record boundary: a failure on one institution is
        # reported in its result instead of aborting the whole run.
        return {
            "ghcid_status": "error",
            "ghcid_error": str(e),
        }
|
|
|
|
|
|
def main():
    """Generate GHCIDs for every institution in the consolidated JSON file.

    Loads ``data/extracted/palestinian_heritage_consolidated.json`` (resolved
    from the directory above this script's directory), stores the generated
    identifiers or a skipped/error status on each institution, prints
    progress and a summary, and writes the file back unless ``--dry-run`` is
    given or no GHCID was generated.

    Returns:
        Process exit code: ``1`` when the data file does not exist, else ``0``.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Generate GHCIDs for Palestinian heritage institutions")
    parser.add_argument("--dry-run", action="store_true", help="Don't save changes")
    args = parser.parse_args()

    # Load data
    data_file = Path(__file__).parent.parent / "data" / "extracted" / "palestinian_heritage_consolidated.json"

    if not data_file.exists():
        print(f"Error: Data file not found: {data_file}")
        return 1

    print(f"Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data.get("institutions", [])
    print(f"Processing {len(institutions)} institutions...")

    stats = {
        "total": len(institutions),
        "generated": 0,
        "skipped": 0,
        "errors": 0,
    }

    for inst in institutions:
        ghcid_data = generate_ghcid_for_institution(inst)

        # None means the record has no name; count it as an error.
        if ghcid_data is None:
            stats["errors"] += 1
            continue

        status = ghcid_data.get("ghcid_status")

        if status == "skipped":
            stats["skipped"] += 1
            inst["ghcid_status"] = "skipped"
            inst["ghcid_reason"] = ghcid_data.get("ghcid_reason")
            print(f" Skipped: {inst.get('name')} - {ghcid_data.get('ghcid_reason')}")
            continue

        if status == "error":
            stats["errors"] += 1
            inst["ghcid_status"] = "error"
            inst["ghcid_error"] = ghcid_data.get("ghcid_error")
            print(f" Error: {inst.get('name')} - {ghcid_data.get('ghcid_error')}")
            continue

        # A record that was skipped or failed in an earlier run still carries
        # those markers; remove them so they do not contradict the new GHCID.
        if inst.get("ghcid_status") in ("skipped", "error"):
            for stale_key in ("ghcid_status", "ghcid_reason", "ghcid_error"):
                inst.pop(stale_key, None)

        # Add GHCID data to institution
        inst.update(ghcid_data)
        stats["generated"] += 1

        print(f" {inst.get('name')}: {ghcid_data.get('ghcid')}")

    # Update metadata
    if not args.dry_run and stats["generated"] > 0:
        # setdefault: a file without "metadata"/"statistics" must not make
        # the run fail with KeyError after all the work is done.
        metadata = data.setdefault("metadata", {})
        metadata["updated"] = datetime.now(timezone.utc).isoformat()
        metadata["version"] = "2.2.0"
        metadata.setdefault("statistics", {})["ghcid_generated"] = stats["generated"]

        # Save. Serialise fully before opening the file for writing, so a
        # serialisation error cannot leave the data file truncated.
        serialized = json.dumps(data, ensure_ascii=False, indent=2)
        with open(data_file, 'w', encoding='utf-8') as f:
            f.write(serialized)
        print(f"\nSaved: {data_file}")

    # Summary
    print("\n" + "=" * 60)
    print("GHCID GENERATION COMPLETE")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"GHCIDs generated: {stats['generated']}")
    print(f"Skipped (online): {stats['skipped']}")
    print(f"Errors: {stats['errors']}")

    return 0
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|