glam/scripts/generate_ghcids_egypt.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

642 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Generate GHCIDs for Egyptian GLAM institutions.
This script:
1. Loads the Egyptian institutions YAML file (with Wikidata/VIAF enrichment)
2. Maps governorate names to ISO 3166-2 codes (EG-C, EG-ALX, etc.)
3. Handles sparse location data:
- Extracts cities from street addresses
- Infers Cairo for national institutions
- Uses coordinates for geocoding
4. Generates GHCID identifiers with four-identifier strategy
5. Updates the YAML file with GHCID fields
6. Detects collisions and appends Wikidata Q-numbers when available
Key Challenges for Egypt:
- 15/29 institutions have NO location data (empty locations array)
- Only 10 institutions have city names
- Some cities are actually street names ("Nile Corniche", "Tahrir Square")
- National institutions often don't specify Cairo explicitly
Solution Strategy:
- Parse street addresses to extract city names (Alexandria from "Chatby, Alexandria")
- Default national libraries/museums/archives to Cairo (EG-C)
- Use Wikidata location data as fallback
- Allow "00-XXX" for institutions with unknown precise location
Usage:
python scripts/generate_ghcids_egypt.py
"""
import json
import re
import sys
import yaml
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import (
GHCIDGenerator,
GHCIDComponents,
InstitutionType,
extract_abbreviation_from_name,
)
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB
class EgyptRegionMapper:
    """Translate Egyptian governorate names into ISO 3166-2 subdivision codes."""

    def __init__(self):
        """Load the ISO 3166-2 reference table for Egypt."""
        self.reference_dir = Path(__file__).parent.parent / "data" / "reference"
        # Reversed table: normalized governorate name -> subdivision code.
        self.eg_mapping = self._load_mapping("iso_3166_2_eg.json", reverse=True)
        # Well-known cities whose governorate code can be inferred directly
        # when the name is not itself a governorate.
        self.city_to_governorate = {
            'CAIRO': 'C',
            'ALEXANDRIA': 'ALX',
            'GIZA': 'GZ',
            'LUXOR': 'LX',
            'ASWAN': 'ASN',
            'PORT SAID': 'PTS',
            'SUEZ': 'SUZ',
        }

    def _load_mapping(self, filename: str, reverse: bool = False) -> Dict[str, str]:
        """Read an ISO 3166-2 mapping file from data/reference/.

        Args:
            filename: JSON file name inside data/reference/
            reverse: if True, build a normalized-name -> code table
                instead of returning the raw code -> name table
        """
        source = self.reference_dir / filename
        with open(source, 'r', encoding='utf-8') as handle:
            table = json.load(handle)
        if not reverse:
            return table
        # Key by normalized name so lookups tolerate case and accents.
        return {self._normalize_name(label): code for code, label in table.items()}

    @staticmethod
    def _normalize_name(name: str) -> str:
        """Return *name* uppercased, accent-stripped, and trimmed for lookup."""
        import unicodedata
        # NFD decomposition separates base letters from combining marks,
        # which are then dropped (category 'Mn').
        decomposed = unicodedata.normalize('NFD', name.upper())
        without_marks = ''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        )
        return without_marks.strip()

    def get_governorate_code(self, governorate_name: str) -> str:
        """Resolve a governorate (or major city) name to its ISO 3166-2 code.

        Args:
            governorate_name: governorate name (e.g., "Cairo", "Alexandria")

        Returns:
            Subdivision code such as "C" or "ALX"; "00" (national-level
            fallback) when the name is unknown.
        """
        key = self._normalize_name(governorate_name)
        direct = self.eg_mapping.get(key)
        if direct is not None:
            return direct
        inferred = self.city_to_governorate.get(key)
        if inferred is not None:
            return inferred
        return "00"
class EgyptLocationInference:
    """Infer city/governorate data for institutions with sparse locations."""

    # Name keywords that mark a national institution (assumed to be in Cairo).
    NATIONAL_KEYWORDS = [
        'national', 'egyptian', 'egypt', 'dar al-kutub', 'dar al-mahfuzat',
        'grand egyptian museum', 'egyptian museum cairo'
    ]

    # Regexes that pull a city name out of a street address.
    CITY_PATTERNS = [
        r',\s*([A-Za-z\s]+)\s+\d{4,}', # ", Alexandria 21526"
        r',\s*([A-Za-z\s]+),\s*Egypt', # ", Cairo, Egypt"
        r'\b([A-Za-z\s]+)\s+\d{4,}\s*,?\s*Egypt', # "Alexandria 21526, Egypt"
    ]

    @classmethod
    def infer_location(cls, record: dict) -> Tuple[Optional[str], Optional[str]]:
        """Infer city and governorate from an institution record.

        Args:
            record: institution record (dict)

        Returns:
            ``(city_name, governorate_name)``; either element may be None.
        """
        entries = record.get('locations', [])
        lowered_name = record.get('name', '').lower()

        # Strategy 1: trust explicit location data when it looks like a city.
        if entries:
            primary = entries[0]
            declared_city = primary.get('city')
            if declared_city and not cls._is_landmark(declared_city):
                return declared_city, cls._infer_governorate_from_city(declared_city)
            # The 'city' field was missing or a landmark; mine the address.
            street = primary.get('street_address', '')
            if street:
                parsed_city = cls._extract_city_from_address(street)
                if parsed_city:
                    return parsed_city, cls._infer_governorate_from_city(parsed_city)

        # Strategy 2: national institutions default to Cairo.
        if any(term in lowered_name for term in cls.NATIONAL_KEYWORDS):
            return 'Cairo', 'Cairo'

        # Strategy 3: nothing usable.
        return None, None

    @staticmethod
    def _is_landmark(city_name: str) -> bool:
        """Return True when the 'city' field is really a landmark/street."""
        return city_name.lower() in {
            'nile corniche', 'tahrir square', 'chatby',
            'downtown', 'zamalek', 'garden city',
        }

    @classmethod
    def _extract_city_from_address(cls, address: str) -> Optional[str]:
        """Extract a city name from a street address, or None."""
        for pattern in cls.CITY_PATTERNS:
            hit = re.search(pattern, address, re.IGNORECASE)
            if not hit:
                continue
            candidate = hit.group(1).strip()
            # Discard matches that are the country name or a bare number.
            if candidate.lower() in ('egypt', 'eg') or candidate.isdigit():
                continue
            return candidate
        return None

    @staticmethod
    def _infer_governorate_from_city(city_name: str) -> Optional[str]:
        """Map a major city name to its governorate, or None if unknown."""
        haystack = city_name.upper()
        # Major cities whose names match their governorate.
        known_pairs = (
            ('CAIRO', 'Cairo'),
            ('ALEXANDRIA', 'Alexandria'),
            ('GIZA', 'Giza'),
            ('LUXOR', 'Luxor'),
            ('ASWAN', 'Aswan'),
            ('PORT SAID', 'Port Said'),
            ('SUEZ', 'Suez'),
        )
        return next(
            (governorate for key, governorate in known_pairs if key in haystack),
            None,
        )
class EgyptGHCIDGenerator:
    """Generate GHCIDs for Egyptian institutions.

    Orchestrates location inference, ISO 3166-2 governorate mapping,
    GeoNames city-code lookup, and collision tracking, while accumulating
    per-run statistics for the summary report.
    """

    def __init__(self):
        """Initialize generator with dependencies."""
        self.ghcid_gen = GHCIDGenerator()
        self.region_mapper = EgyptRegionMapper()
        self.geonames_db = GeoNamesDB()
        # Statistics
        # Counters shown by print_statistics(); 'errors' collects
        # human-readable messages instead of raising, so one bad record
        # cannot abort the whole run.
        self.stats = {
            'total_institutions': 0,
            'ghcids_generated': 0,
            'location_inferred': 0,
            'defaulted_to_cairo': 0,
            'missing_city_code': 0,
            'missing_governorate_code': 0,
            'collisions_detected': 0,
            'errors': [],
        }
        # Collision detection
        self.ghcid_usage: Dict[str, List[str]] = defaultdict(list)  # GHCID -> [institution_names]

    @staticmethod
    def _get_city_code_fallback(city_name: str) -> str:
        """
        Generate 3-letter city code from city name.

        Used when the GeoNames lookup fails.

        Args:
            city_name: City name (e.g., "Cairo", "Alexandria")

        Returns:
            3-letter uppercase code (e.g., "CAI", "ALE")
        """
        import unicodedata
        # Remove accents (NFD decomposition, then drop combining marks)
        normalized = unicodedata.normalize('NFD', city_name)
        normalized = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
        # Split into words
        words = normalized.split()
        if len(words) == 1:
            # Single word: take first 3 letters
            code = words[0][:3].upper()
        else:
            # Multi-word: take first letter of each word (up to 3)
            code = ''.join(w[0] for w in words[:3]).upper()
        # Ensure exactly 3 letters; pad short codes with 'X'
        if len(code) < 3:
            code = code.ljust(3, 'X')
        elif len(code) > 3:
            code = code[:3]
        return code

    def generate_for_institution(self, record: dict) -> Optional[GHCIDComponents]:
        """
        Generate GHCID for a single Egyptian institution.

        Updates self.stats and self.ghcid_usage as a side effect.

        Args:
            record: Institution record from YAML (dict)

        Returns:
            GHCIDComponents if successful, None otherwise (the failure
            reason is appended to self.stats['errors'])
        """
        self.stats['total_institutions'] += 1
        try:
            # Extract required fields
            name = record.get('name')
            institution_type_str = record.get('institution_type', 'UNKNOWN')
            if not name:
                self.stats['errors'].append(f"Missing name for record: {record.get('id')}")
                return None
            # Country code (always EG)
            country_code = "EG"
            # Infer location data
            city_name, governorate_name = EgyptLocationInference.infer_location(record)
            if not city_name:
                # Default to Cairo for national institutions
                # NOTE(review): infer_location already returns Cairo for
                # records whose name matches NATIONAL_KEYWORDS, so this
                # branch appears to be a defensive duplicate — confirm.
                if any(keyword in name.lower() for keyword in EgyptLocationInference.NATIONAL_KEYWORDS):
                    city_name = "Cairo"
                    governorate_name = "Cairo"
                    self.stats['defaulted_to_cairo'] += 1
                else:
                    self.stats['errors'].append(f"No location data for: {name}")
                    return None
            else:
                if governorate_name:
                    self.stats['location_inferred'] += 1
            # Get governorate code (ISO 3166-2)
            governorate_code = "00"  # Default to national-level
            if governorate_name:
                governorate_code = self.region_mapper.get_governorate_code(governorate_name)
                if governorate_code == "00":
                    self.stats['missing_governorate_code'] += 1
            # Get city code from GeoNames
            city_code = "XXX"  # Default for unknown
            if city_name:
                city_info = self.geonames_db.lookup_city(city_name, country_code)
                if city_info:
                    city_code = city_info.get_abbreviation()
                else:
                    self.stats['missing_city_code'] += 1
                    # Fallback: use first 3 letters of city name
                    city_code = self._get_city_code_fallback(city_name)
            # Map institution type to GHCID type code; unrecognized
            # type strings fall back to UNKNOWN rather than failing.
            try:
                inst_type = InstitutionType[institution_type_str]
            except KeyError:
                inst_type = InstitutionType.UNKNOWN
            # Generate abbreviation from name
            abbreviation = extract_abbreviation_from_name(name)
            # Create GHCID components
            components = GHCIDComponents(
                country_code=country_code,
                region_code=governorate_code,
                city_locode=city_code,
                institution_type=inst_type.value,
                abbreviation=abbreviation,
            )
            # Validate
            is_valid, error_msg = components.validate()
            if not is_valid:
                self.stats['errors'].append(f"Invalid GHCID for {name}: {error_msg}")
                return None
            # Check for collisions (before Q-number)
            # NOTE(review): only the second-and-later holders of a base
            # GHCID increment the collision counter; the first holder is
            # not retroactively flagged.
            base_ghcid = components.to_string()
            self.ghcid_usage[base_ghcid].append(name)
            if len(self.ghcid_usage[base_ghcid]) > 1:
                self.stats['collisions_detected'] += 1
            self.stats['ghcids_generated'] += 1
            return components
        except Exception as e:
            # Catch-all so a malformed record degrades to an error entry.
            self.stats['errors'].append(f"Error generating GHCID for {record.get('name', 'unknown')}: {e}")
            return None

    def process_all_institutions(self, input_file: Path) -> List[dict]:
        """
        Process all institutions in YAML file and generate GHCIDs.

        Args:
            input_file: Path to Egyptian institutions YAML file

        Returns:
            List of updated institution records with GHCID fields
        """
        print(f"Loading Egyptian institutions from: {input_file}")
        with open(input_file, 'r', encoding='utf-8') as f:
            institutions = yaml.safe_load(f)
        print(f"Found {len(institutions)} institutions")
        print()
        updated_institutions = []
        for i, record in enumerate(institutions, 1):
            print(f"Processing {i}/{len(institutions)}: {record.get('name', 'unknown')}")
            # Generate GHCID
            ghcid_components = self.generate_for_institution(record)
            if ghcid_components:
                # Check for Wikidata Q-number (for collision resolution)
                wikidata_qid = None
                identifiers = record.get('identifiers', [])
                for identifier in identifiers:
                    if identifier.get('identifier_scheme') == 'Wikidata':
                        wikidata_qid = identifier.get('identifier_value')
                        break
                # If collision exists and we have Q-number, append it
                # NOTE(review): only records processed AFTER the collision
                # appears get the Q-suffix; the first holder keeps the
                # bare base GHCID — confirm this is intended.
                base_ghcid = ghcid_components.to_string()
                if len(self.ghcid_usage[base_ghcid]) > 1 and wikidata_qid:
                    # Append Q-number for disambiguation
                    ghcid_with_q = f"{base_ghcid}-{wikidata_qid}"
                    record['ghcid'] = ghcid_with_q
                    print(f" → Collision detected, using GHCID with Q-number: {ghcid_with_q}")
                else:
                    record['ghcid'] = base_ghcid
                    print(f" → GHCID: {base_ghcid}")
                # Add UUID v5 (SHA-1) - PRIMARY identifier
                record['ghcid_uuid'] = str(ghcid_components.to_uuid())
                # Add UUID v8 (SHA-256) - Secondary identifier
                record['ghcid_uuid_sha256'] = str(ghcid_components.to_uuid_sha256())
                # Add numeric identifier
                record['ghcid_numeric'] = ghcid_components.to_numeric()
                # Add GHCID to identifiers list (idempotent: skip if present)
                has_ghcid = any(i.get('identifier_scheme') == 'GHCID' for i in identifiers)
                if not has_ghcid:
                    identifiers.append({
                        'identifier_scheme': 'GHCID',
                        'identifier_value': record['ghcid'],
                    })
                # Re-attach in case the record had no 'identifiers' key.
                record['identifiers'] = identifiers
                # Update provenance with GHCID generation metadata
                provenance = record.get('provenance', {})
                provenance['ghcid_generation'] = {
                    'generated_date': datetime.now(timezone.utc).isoformat(),
                    'generation_method': 'EgyptGHCIDGenerator with location inference',
                    'base_ghcid': base_ghcid,
                    'has_wikidata_disambiguation': wikidata_qid is not None,
                }
                record['provenance'] = provenance
            # Every record is kept in the output, with or without a GHCID.
            updated_institutions.append(record)
        return updated_institutions

    def print_statistics(self) -> None:
        """Print generation statistics accumulated during processing."""
        print()
        print("=" * 70)
        print("EGYPT GHCID GENERATION STATISTICS")
        print("=" * 70)
        print(f"Total institutions processed: {self.stats['total_institutions']}")
        print(f"GHCIDs successfully generated: {self.stats['ghcids_generated']}")
        print(f"Locations inferred from data: {self.stats['location_inferred']}")
        print(f"Defaulted to Cairo (national inst): {self.stats['defaulted_to_cairo']}")
        print(f"Missing city codes (used fallback): {self.stats['missing_city_code']}")
        print(f"Missing governorate codes ('00'): {self.stats['missing_governorate_code']}")
        print(f"GHCID collisions detected: {self.stats['collisions_detected']}")
        print()
        if self.stats['errors']:
            print(f"⚠️ Errors encountered: {len(self.stats['errors'])}")
            print()
            print("Error details:")
            for error in self.stats['errors']:
                print(f" - {error}")
        else:
            print("✅ No errors!")
        print()
        # Show collisions
        if self.stats['collisions_detected'] > 0:
            print("⚠️ GHCID COLLISIONS DETECTED:")
            print()
            for ghcid, names in self.ghcid_usage.items():
                if len(names) > 1:
                    print(f" {ghcid}:")
                    for name in names:
                        print(f" - {name}")
            print()
            print("Note: Collisions resolved with Wikidata Q-numbers where available")
        else:
            print("✅ No GHCID collisions detected!")
        print()

    def validate_ghcids(self, institutions: List[dict]) -> None:
        """
        Validate all generated GHCIDs.

        Checks uniqueness of all four identifier forms and prints a
        report; duplicates of the string GHCID are listed explicitly.

        Args:
            institutions: List of institution records
        """
        print("=" * 70)
        print("VALIDATION")
        print("=" * 70)
        ghcid_set = set()
        numeric_set = set()
        uuid_v5_set = set()
        uuid_v8_set = set()
        duplicates = []
        for record in institutions:
            ghcid = record.get('ghcid')
            ghcid_numeric = record.get('ghcid_numeric')
            ghcid_uuid = record.get('ghcid_uuid')
            ghcid_uuid_sha256 = record.get('ghcid_uuid_sha256')
            if ghcid:
                # Seen before -> record as duplicate before adding.
                if ghcid in ghcid_set:
                    duplicates.append(ghcid)
                ghcid_set.add(ghcid)
            if ghcid_numeric:
                numeric_set.add(ghcid_numeric)
            if ghcid_uuid:
                uuid_v5_set.add(ghcid_uuid)
            if ghcid_uuid_sha256:
                uuid_v8_set.add(ghcid_uuid_sha256)
        print(f"Unique GHCIDs (with Q-numbers): {len(ghcid_set)}")
        print(f"Unique numeric GHCIDs: {len(numeric_set)}")
        print(f"Unique UUID v5 (SHA-1) identifiers: {len(uuid_v5_set)}")
        print(f"Unique UUID v8 (SHA-256) ids: {len(uuid_v8_set)}")
        if duplicates:
            print(f"⚠️ Duplicate GHCIDs found: {len(duplicates)}")
            for dup in duplicates:
                print(f" - {dup}")
        else:
            print("✅ All GHCIDs are unique!")
        print()
def main():
    """Main entry point.

    Backs up the Wikidata/VIAF-enriched Egyptian institutions file,
    generates GHCIDs for every record, prints statistics and validation
    results, and writes the enriched YAML (with a commented header) to a
    new output file.

    Exits with status 1 if the input file does not exist.
    """
    # Local import: shutil is only needed for the one-off backup copy.
    import shutil

    # Paths
    project_root = Path(__file__).parent.parent
    input_file = project_root / "data" / "instances" / "egypt_institutions_wikidata_viaf.yaml"
    output_file = project_root / "data" / "instances" / "egypt_institutions_ghcid.yaml"
    backup_file = project_root / "data" / "instances" / "archive" / f"egypt_institutions_pre_ghcid_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"

    # Fail fast with a clear message instead of an opaque traceback
    # from shutil.copy when the input file is missing.
    if not input_file.exists():
        print(f"ERROR: input file not found: {input_file}", file=sys.stderr)
        sys.exit(1)

    # Create backup before any processing so the source data is never lost.
    print(f"Creating backup: {backup_file}")
    backup_file.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(input_file, backup_file)
    print()

    # Generate GHCIDs
    generator = EgyptGHCIDGenerator()
    updated_institutions = generator.process_all_institutions(input_file)

    # Print statistics
    generator.print_statistics()

    # Validate uniqueness of all identifier forms
    generator.validate_ghcids(updated_institutions)

    # Write updated YAML
    print("=" * 70)
    print(f"Writing updated YAML to: {output_file}")
    # Add header comment (YAML comments, written verbatim before the dump)
    header = f"""---
# Egyptian GLAM Institutions - GHCID Enhanced
# Last updated: {datetime.now(timezone.utc).isoformat()}
# GHCID generation: {generator.stats['ghcids_generated']}/{generator.stats['total_institutions']} institutions
#
# GHCID Statistics:
# - Total institutions: {generator.stats['total_institutions']}
# - GHCIDs generated: {generator.stats['ghcids_generated']}
# - Locations inferred: {generator.stats['location_inferred']}
# - Defaulted to Cairo: {generator.stats['defaulted_to_cairo']}
# - Missing city codes: {generator.stats['missing_city_code']}
# - Missing governorate codes: {generator.stats['missing_governorate_code']}
# - Collisions detected: {generator.stats['collisions_detected']}
#
# Four-Identifier Strategy:
# - ghcid: Base GHCID string (with Q-number for collisions)
# - ghcid_uuid: UUID v5 (SHA-1) - PRIMARY persistent identifier
# - ghcid_uuid_sha256: UUID v8 (SHA-256) - Secondary identifier
# - ghcid_numeric: 64-bit numeric for CSV exports
#
# Location Inference:
# - Extracted cities from street addresses (e.g., "Chatby, Alexandria")
# - Defaulted national institutions to Cairo
# - Used fallback city codes when GeoNames lookup failed
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(updated_institutions, f,
                  default_flow_style=False,
                  allow_unicode=True,
                  sort_keys=False,
                  width=100)
    print(f"✅ Done! Updated {len(updated_institutions)} institutions")
    print(f"✅ Output file: {output_file}")
    print()


if __name__ == "__main__":
    main()