#!/usr/bin/env python3
"""
Generate GHCIDs for Egyptian GLAM institutions.

This script:
1. Loads the Egyptian institutions YAML file (with Wikidata/VIAF enrichment)
2. Maps governorate names to ISO 3166-2 codes (EG-C, EG-ALX, etc.)
3. Handles sparse location data:
   - Extracts cities from street addresses
   - Infers Cairo for national institutions
   - Uses coordinates for geocoding
4. Generates GHCID identifiers with four-identifier strategy
5. Updates the YAML file with GHCID fields
6. Detects collisions and appends Wikidata Q-numbers when available

Key Challenges for Egypt:
- 15/29 institutions have NO location data (empty locations array)
- Only 10 institutions have city names
- Some cities are actually street names ("Nile Corniche", "Tahrir Square")
- National institutions often don't specify Cairo explicitly

Solution Strategy:
- Parse street addresses to extract city names (Alexandria from "Chatby, Alexandria")
- Default national libraries/museums/archives to Cairo (EG-C)
- Use Wikidata location data as fallback
- Allow "00-XXX" for institutions with unknown precise location

Usage:
    python scripts/generate_ghcids_egypt.py
"""

import json
import re
import shutil
import sys
import unicodedata
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import (
    GHCIDGenerator,
    GHCIDComponents,
    InstitutionType,
    extract_abbreviation_from_name,
)
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB


class EgyptRegionMapper:
    """Maps Egyptian governorate names to ISO 3166-2 codes."""

    def __init__(self):
        """Load ISO 3166-2 mappings from reference data."""
        self.reference_dir = Path(__file__).parent.parent / "data" / "reference"

        # Load Egypt mapping (normalized governorate name -> subdivision code)
        self.eg_mapping = self._load_mapping("iso_3166_2_eg.json", reverse=True)

        # Egyptian city -> governorate inference
        # Some cities are well-known and we can infer the governorate
        self.city_to_governorate = {
            'CAIRO': 'C',
            'ALEXANDRIA': 'ALX',
            'GIZA': 'GZ',
            'LUXOR': 'LX',
            'ASWAN': 'ASN',
            'PORT SAID': 'PTS',
            'SUEZ': 'SUZ',
        }

    def _load_mapping(self, filename: str, reverse: bool = False) -> Dict[str, str]:
        """
        Load ISO 3166-2 mapping from JSON file.

        Args:
            filename: JSON file in data/reference/
            reverse: If True, create name->code mapping (default is code->name)
        """
        filepath = self.reference_dir / filename
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if reverse:
            # Create normalized name -> code mapping so lookups are
            # insensitive to case and diacritics
            mapping = {}
            for code, name in data.items():
                normalized_name = self._normalize_name(name)
                mapping[normalized_name] = code
            return mapping

        return data

    @staticmethod
    def _normalize_name(name: str) -> str:
        """
        Normalize governorate/city name for lookup.

        - Uppercase
        - Remove accents
        - Strip whitespace
        """
        # Uppercase
        normalized = name.upper()
        # Remove accents (NFD decomposition drops combining marks, category 'Mn')
        normalized = unicodedata.normalize('NFD', normalized)
        normalized = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
        # Strip whitespace
        normalized = normalized.strip()
        return normalized

    def get_governorate_code(self, governorate_name: str) -> str:
        """
        Get ISO 3166-2 governorate code.

        Args:
            governorate_name: Governorate name (e.g., "Cairo", "Alexandria")

        Returns:
            ISO 3166-2 subdivision code (e.g., "C", "ALX")
            Returns "00" if governorate not found (national-level fallback)
        """
        normalized = self._normalize_name(governorate_name)

        if normalized in self.eg_mapping:
            return self.eg_mapping[normalized]

        # Check city->governorate inference
        if normalized in self.city_to_governorate:
            return self.city_to_governorate[normalized]

        # Fallback: return "00" for national-level
        return "00"


class EgyptLocationInference:
    """Infer location data from various sources."""

    # National institutions keywords (likely in Cairo)
    NATIONAL_KEYWORDS = [
        'national', 'egyptian', 'egypt', 'dar al-kutub',
        'dar al-mahfuzat', 'grand egyptian museum', 'egyptian museum cairo'
    ]

    # City extraction patterns from addresses
    CITY_PATTERNS = [
        r',\s*([A-Za-z\s]+)\s+\d{4,}',  # ", Alexandria 21526"
        r',\s*([A-Za-z\s]+),\s*Egypt',  # ", Cairo, Egypt"
        r'\b([A-Za-z\s]+)\s+\d{4,}\s*,?\s*Egypt',  # "Alexandria 21526, Egypt"
    ]

    @classmethod
    def _location_from_data(cls, record: dict) -> Tuple[Optional[str], Optional[str]]:
        """
        Strategy 1: derive (city, governorate) from the record's own locations.

        Returns (None, None) when the record has no usable location data, so
        callers can distinguish data-derived locations from heuristic defaults.
        """
        locations = record.get('locations', [])
        if not locations:
            return None, None

        location = locations[0]
        city = location.get('city')

        # Check if city is actually a street/landmark
        if city and not cls._is_landmark(city):
            # Infer governorate from city
            return city, cls._infer_governorate_from_city(city)

        # Try extracting from street address
        address = location.get('street_address', '')
        if address:
            extracted_city = cls._extract_city_from_address(address)
            if extracted_city:
                return extracted_city, cls._infer_governorate_from_city(extracted_city)

        return None, None

    @classmethod
    def infer_location(cls, record: dict) -> Tuple[Optional[str], Optional[str]]:
        """
        Infer city and governorate from institution record.

        Args:
            record: Institution record (dict)

        Returns:
            Tuple of (city_name, governorate_name) - may be None
        """
        # Strategy 1: Use existing location data
        city, governorate = cls._location_from_data(record)
        if city:
            return city, governorate

        # Strategy 2: Infer Cairo for national institutions
        name = record.get('name', '').lower()
        if any(keyword in name for keyword in cls.NATIONAL_KEYWORDS):
            return 'Cairo', 'Cairo'

        # Strategy 3: No location data
        return None, None

    @staticmethod
    def _is_landmark(city_name: str) -> bool:
        """Check if 'city' is actually a landmark/street."""
        landmarks = [
            'nile corniche', 'tahrir square', 'chatby',
            'downtown', 'zamalek', 'garden city'
        ]
        return city_name.lower() in landmarks

    @classmethod
    def _extract_city_from_address(cls, address: str) -> Optional[str]:
        """Extract city name from street address."""
        for pattern in cls.CITY_PATTERNS:
            match = re.search(pattern, address, re.IGNORECASE)
            if match:
                city = match.group(1).strip()
                # Filter out postal codes, Egypt
                if city.lower() not in ['egypt', 'eg'] and not city.isdigit():
                    return city
        return None

    @staticmethod
    def _infer_governorate_from_city(city_name: str) -> Optional[str]:
        """Map city name to governorate."""
        city_upper = city_name.upper()

        # Major cities that match governorate names
        major_cities = {
            'CAIRO': 'Cairo',
            'ALEXANDRIA': 'Alexandria',
            'GIZA': 'Giza',
            'LUXOR': 'Luxor',
            'ASWAN': 'Aswan',
            'PORT SAID': 'Port Said',
            'SUEZ': 'Suez',
        }

        for city_key, governorate in major_cities.items():
            if city_key in city_upper:
                return governorate

        return None


class EgyptGHCIDGenerator:
    """Generate GHCIDs for Egyptian institutions."""

    def __init__(self):
        """Initialize generator with dependencies."""
        self.ghcid_gen = GHCIDGenerator()
        self.region_mapper = EgyptRegionMapper()
        self.geonames_db = GeoNamesDB()

        # Statistics
        self.stats = {
            'total_institutions': 0,
            'ghcids_generated': 0,
            'location_inferred': 0,
            'defaulted_to_cairo': 0,
            'missing_city_code': 0,
            'missing_governorate_code': 0,
            'collisions_detected': 0,
            'errors': [],
        }

        # Collision detection: GHCID -> [institution_names]
        self.ghcid_usage: Dict[str, List[str]] = defaultdict(list)

    @staticmethod
    def _get_city_code_fallback(city_name: str) -> str:
        """
        Generate 3-letter city code from city name.

        Args:
            city_name: City name (e.g., "Cairo", "Alexandria")

        Returns:
            3-letter uppercase code (e.g., "CAI", "ALE")
        """
        # Remove accents
        normalized = unicodedata.normalize('NFD', city_name)
        normalized = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

        # Split into words
        words = normalized.split()

        if len(words) == 1:
            # Single word: take first 3 letters
            code = words[0][:3].upper()
        else:
            # Multi-word: take first letter of each word (up to 3)
            code = ''.join(w[0] for w in words[:3]).upper()

        # Ensure exactly 3 letters (pad with 'X' or truncate)
        if len(code) < 3:
            code = code.ljust(3, 'X')
        elif len(code) > 3:
            code = code[:3]

        return code

    def generate_for_institution(self, record: dict) -> Optional[GHCIDComponents]:
        """
        Generate GHCID for a single Egyptian institution.

        Args:
            record: Institution record from YAML (dict)

        Returns:
            GHCIDComponents if successful, None otherwise
        """
        self.stats['total_institutions'] += 1

        try:
            # Extract required fields
            name = record.get('name')
            institution_type_str = record.get('institution_type', 'UNKNOWN')

            if not name:
                self.stats['errors'].append(f"Missing name for record: {record.get('id')}")
                return None

            # Country code (always EG)
            country_code = "EG"

            # Infer location data.  infer_location() already applies the
            # national-institution -> Cairo default (Strategy 2), so a None
            # city here genuinely means "no location derivable at all".
            city_name, governorate_name = EgyptLocationInference.infer_location(record)

            if not city_name:
                self.stats['errors'].append(f"No location data for: {name}")
                return None

            # Attribute stats correctly: if the record's own location data
            # yielded nothing, the Cairo result came from the national-keyword
            # heuristic, not from the data.
            # (BUGFIX: previously 'defaulted_to_cairo' was incremented in an
            # unreachable branch and always stayed 0.)
            data_city, _ = EgyptLocationInference._location_from_data(record)
            if data_city is None:
                self.stats['defaulted_to_cairo'] += 1
            elif governorate_name:
                self.stats['location_inferred'] += 1

            # Get governorate code (ISO 3166-2)
            governorate_code = "00"  # Default to national-level
            if governorate_name:
                governorate_code = self.region_mapper.get_governorate_code(governorate_name)
                if governorate_code == "00":
                    self.stats['missing_governorate_code'] += 1

            # Get city code from GeoNames
            city_code = "XXX"  # Default for unknown
            if city_name:
                city_info = self.geonames_db.lookup_city(city_name, country_code)
                if city_info:
                    city_code = city_info.get_abbreviation()
                else:
                    self.stats['missing_city_code'] += 1
                    # Fallback: use first 3 letters of city name
                    city_code = self._get_city_code_fallback(city_name)

            # Map institution type to GHCID type code
            try:
                inst_type = InstitutionType[institution_type_str]
            except KeyError:
                inst_type = InstitutionType.UNKNOWN

            # Generate abbreviation from name
            abbreviation = extract_abbreviation_from_name(name)

            # Create GHCID components
            components = GHCIDComponents(
                country_code=country_code,
                region_code=governorate_code,
                city_locode=city_code,
                institution_type=inst_type.value,
                abbreviation=abbreviation,
            )

            # Validate
            is_valid, error_msg = components.validate()
            if not is_valid:
                self.stats['errors'].append(f"Invalid GHCID for {name}: {error_msg}")
                return None

            # Check for collisions (before Q-number)
            base_ghcid = components.to_string()
            self.ghcid_usage[base_ghcid].append(name)
            if len(self.ghcid_usage[base_ghcid]) > 1:
                self.stats['collisions_detected'] += 1

            self.stats['ghcids_generated'] += 1
            return components

        except Exception as e:
            self.stats['errors'].append(
                f"Error generating GHCID for {record.get('name', 'unknown')}: {e}"
            )
            return None

    @staticmethod
    def _extract_wikidata_qid(identifiers: List[dict]) -> Optional[str]:
        """Return the first Wikidata Q-number in an identifiers list, if any."""
        for identifier in identifiers:
            if identifier.get('identifier_scheme') == 'Wikidata':
                return identifier.get('identifier_value')
        return None

    def process_all_institutions(self, input_file: Path) -> List[dict]:
        """
        Process all institutions in YAML file and generate GHCIDs.

        Args:
            input_file: Path to Egyptian institutions YAML file

        Returns:
            List of updated institution records with GHCID fields
        """
        print(f"Loading Egyptian institutions from: {input_file}")
        with open(input_file, 'r', encoding='utf-8') as f:
            institutions = yaml.safe_load(f)

        print(f"Found {len(institutions)} institutions")
        print()

        updated_institutions = []

        for i, record in enumerate(institutions, 1):
            print(f"Processing {i}/{len(institutions)}: {record.get('name', 'unknown')}")

            # Generate GHCID
            ghcid_components = self.generate_for_institution(record)

            if ghcid_components:
                # Check for Wikidata Q-number (for collision resolution)
                identifiers = record.get('identifiers', [])
                wikidata_qid = self._extract_wikidata_qid(identifiers)

                # If collision exists and we have Q-number, append it
                base_ghcid = ghcid_components.to_string()
                if len(self.ghcid_usage[base_ghcid]) > 1 and wikidata_qid:
                    # Append Q-number for disambiguation
                    ghcid_with_q = f"{base_ghcid}-{wikidata_qid}"
                    record['ghcid'] = ghcid_with_q
                    print(f"  → Collision detected, using GHCID with Q-number: {ghcid_with_q}")
                else:
                    record['ghcid'] = base_ghcid
                    print(f"  → GHCID: {base_ghcid}")

                # Add UUID v5 (SHA-1) - PRIMARY identifier
                record['ghcid_uuid'] = str(ghcid_components.to_uuid())

                # Add UUID v8 (SHA-256) - Secondary identifier
                record['ghcid_uuid_sha256'] = str(ghcid_components.to_uuid_sha256())

                # Add numeric identifier
                record['ghcid_numeric'] = ghcid_components.to_numeric()

                # Add GHCID to identifiers list (once)
                has_ghcid = any(i.get('identifier_scheme') == 'GHCID' for i in identifiers)
                if not has_ghcid:
                    identifiers.append({
                        'identifier_scheme': 'GHCID',
                        'identifier_value': record['ghcid'],
                    })
                record['identifiers'] = identifiers

                # Update provenance with GHCID generation metadata
                provenance = record.get('provenance', {})
                provenance['ghcid_generation'] = {
                    'generated_date': datetime.now(timezone.utc).isoformat(),
                    'generation_method': 'EgyptGHCIDGenerator with location inference',
                    'base_ghcid': base_ghcid,
                    'has_wikidata_disambiguation': wikidata_qid is not None,
                }
                record['provenance'] = provenance

            updated_institutions.append(record)

        return updated_institutions

    def print_statistics(self):
        """Print generation statistics."""
        print()
        print("=" * 70)
        print("EGYPT GHCID GENERATION STATISTICS")
        print("=" * 70)
        print(f"Total institutions processed: {self.stats['total_institutions']}")
        print(f"GHCIDs successfully generated: {self.stats['ghcids_generated']}")
        print(f"Locations inferred from data: {self.stats['location_inferred']}")
        print(f"Defaulted to Cairo (national inst): {self.stats['defaulted_to_cairo']}")
        print(f"Missing city codes (used fallback): {self.stats['missing_city_code']}")
        print(f"Missing governorate codes ('00'): {self.stats['missing_governorate_code']}")
        print(f"GHCID collisions detected: {self.stats['collisions_detected']}")
        print()

        if self.stats['errors']:
            print(f"⚠️  Errors encountered: {len(self.stats['errors'])}")
            print()
            print("Error details:")
            for error in self.stats['errors']:
                print(f"  - {error}")
        else:
            print("✅ No errors!")
        print()

        # Show collisions
        if self.stats['collisions_detected'] > 0:
            print("⚠️  GHCID COLLISIONS DETECTED:")
            print()
            for ghcid, names in self.ghcid_usage.items():
                if len(names) > 1:
                    print(f"  {ghcid}:")
                    for name in names:
                        print(f"    - {name}")
            print()
            print("Note: Collisions resolved with Wikidata Q-numbers where available")
        else:
            print("✅ No GHCID collisions detected!")
        print()

    def validate_ghcids(self, institutions: List[dict]):
        """
        Validate all generated GHCIDs.

        Args:
            institutions: List of institution records
        """
        print("=" * 70)
        print("VALIDATION")
        print("=" * 70)

        ghcid_set = set()
        numeric_set = set()
        uuid_v5_set = set()
        uuid_v8_set = set()
        duplicates = []

        for record in institutions:
            ghcid = record.get('ghcid')
            ghcid_numeric = record.get('ghcid_numeric')
            ghcid_uuid = record.get('ghcid_uuid')
            ghcid_uuid_sha256 = record.get('ghcid_uuid_sha256')

            if ghcid:
                if ghcid in ghcid_set:
                    duplicates.append(ghcid)
                ghcid_set.add(ghcid)
            if ghcid_numeric:
                numeric_set.add(ghcid_numeric)
            if ghcid_uuid:
                uuid_v5_set.add(ghcid_uuid)
            if ghcid_uuid_sha256:
                uuid_v8_set.add(ghcid_uuid_sha256)

        print(f"Unique GHCIDs (with Q-numbers): {len(ghcid_set)}")
        print(f"Unique numeric GHCIDs: {len(numeric_set)}")
        print(f"Unique UUID v5 (SHA-1) identifiers: {len(uuid_v5_set)}")
        print(f"Unique UUID v8 (SHA-256) ids: {len(uuid_v8_set)}")

        if duplicates:
            print(f"⚠️  Duplicate GHCIDs found: {len(duplicates)}")
            for dup in duplicates:
                print(f"  - {dup}")
        else:
            print("✅ All GHCIDs are unique!")
        print()


def main():
    """Main entry point."""
    # Paths
    project_root = Path(__file__).parent.parent
    input_file = project_root / "data" / "instances" / "egypt_institutions_wikidata_viaf.yaml"
    output_file = project_root / "data" / "instances" / "egypt_institutions_ghcid.yaml"
    backup_file = (
        project_root / "data" / "instances" / "archive"
        / f"egypt_institutions_pre_ghcid_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
    )

    # Create backup before any modification
    print(f"Creating backup: {backup_file}")
    backup_file.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(input_file, backup_file)
    print()

    # Generate GHCIDs
    generator = EgyptGHCIDGenerator()
    updated_institutions = generator.process_all_institutions(input_file)

    # Print statistics
    generator.print_statistics()

    # Validate
    generator.validate_ghcids(updated_institutions)

    # Write updated YAML
    print("=" * 70)
    print(f"Writing updated YAML to: {output_file}")

    # Add header comment
    header = f"""---
# Egyptian GLAM Institutions - GHCID Enhanced
# Last updated: {datetime.now(timezone.utc).isoformat()}
# GHCID generation: {generator.stats['ghcids_generated']}/{generator.stats['total_institutions']} institutions
#
# GHCID Statistics:
# - Total institutions: {generator.stats['total_institutions']}
# - GHCIDs generated: {generator.stats['ghcids_generated']}
# - Locations inferred: {generator.stats['location_inferred']}
# - Defaulted to Cairo: {generator.stats['defaulted_to_cairo']}
# - Missing city codes: {generator.stats['missing_city_code']}
# - Missing governorate codes: {generator.stats['missing_governorate_code']}
# - Collisions detected: {generator.stats['collisions_detected']}
#
# Four-Identifier Strategy:
# - ghcid: Base GHCID string (with Q-number for collisions)
# - ghcid_uuid: UUID v5 (SHA-1) - PRIMARY persistent identifier
# - ghcid_uuid_sha256: UUID v8 (SHA-256) - Secondary identifier
# - ghcid_numeric: 64-bit numeric for CSV exports
#
# Location Inference:
# - Extracted cities from street addresses (e.g., "Chatby, Alexandria")
# - Defaulted national institutions to Cairo
# - Used fallback city codes when GeoNames lookup failed
"""

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(
            updated_institutions, f,
            default_flow_style=False, allow_unicode=True,
            sort_keys=False, width=100,
        )

    print(f"✅ Done! Updated {len(updated_institutions)} institutions")
    print(f"✅ Output file: {output_file}")
    print()


if __name__ == "__main__":
    main()