glam/scripts/generate_ghcids_latin_america.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

494 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Generate GHCIDs for Latin American GLAM institutions.
This script:
1. Loads the authoritative Latin American institutions YAML file
2. Maps region names to ISO 3166-2 codes (BR, CL, MX)
3. Looks up city codes from GeoNames database
4. Generates GHCID identifiers for each institution
5. Updates the YAML file with GHCID fields
6. Validates uniqueness and format
Usage:
python scripts/generate_ghcids_latin_america.py
"""
import json
import sys
import yaml
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set
# NOTE(review): Counter and Set appear unused in this file — confirm before removing.
# Add src to path for imports (script lives in scripts/, package code in ../src)
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import (
    GHCIDGenerator,
    GHCIDComponents,
    InstitutionType,
    extract_abbreviation_from_name,
)
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB
class RegionMapper:
    """Maps full region names to ISO 3166-2 subdivision codes.

    Supported countries: BR, CL, MX, US, AR.  Lookups are
    accent-insensitive and case-insensitive; unknown regions (or
    unsupported countries) fall back to the national-level code "00".
    """

    def __init__(self):
        """Load ISO 3166-2 mappings from reference data."""
        self.reference_dir = Path(__file__).parent.parent / "data" / "reference"
        # Load reversed (normalized name -> code) mappings for each country.
        # Kept as individual attributes for backward compatibility.
        self.br_mapping = self._load_mapping("iso_3166_2_br.json", reverse=True)
        self.cl_mapping = self._load_mapping("iso_3166_2_cl.json", reverse=True)
        self.mx_mapping = self._load_mapping("iso_3166_2_mx.json", reverse=True)
        self.us_mapping = self._load_mapping("iso_3166_2_us.json", reverse=True)
        self.ar_mapping = self._load_mapping("iso_3166_2_ar.json", reverse=True)

    def _load_mapping(self, filename: str, reverse: bool = False) -> Dict[str, str]:
        """
        Load an ISO 3166-2 mapping from a JSON file.

        Args:
            filename: JSON file in data/reference/ (code -> name pairs)
            reverse: If True, return a normalized name -> code mapping
                (default is the raw code -> name mapping)

        Returns:
            Mapping dict; keys are accent-stripped, uppercased names
            when reverse=True.
        """
        filepath = self.reference_dir / filename
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not reverse:
            return data
        # Invert to normalized-name -> code so lookups survive accents/case.
        return {
            self._normalize_region_name(name): code
            for code, name in data.items()
        }

    @staticmethod
    def _normalize_region_name(name: str) -> str:
        """
        Normalize a region name for lookup.

        - Uppercase
        - Remove accents via NFD decomposition (é->E, ã->A, ñ->N, etc.)
        - Strip surrounding whitespace
        """
        import unicodedata
        normalized = unicodedata.normalize('NFD', name.upper())
        # Drop combining marks (category 'Mn') left behind by NFD.
        normalized = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
        return normalized.strip()

    def get_region_code(self, region_name: str, country_code: str) -> Optional[str]:
        """
        Get the ISO 3166-2 subdivision code for a region.

        Args:
            region_name: Full region name (e.g., "São Paulo", "Valparaíso")
            country_code: ISO 3166-1 country code (e.g., "BR", "CL", "MX")

        Returns:
            ISO 3166-2 subdivision code (e.g., "SP", "VS", "CMX"), or
            "00" (national-level fallback) when the region — or the
            country itself — is not found.
        """
        normalized = self._normalize_region_name(region_name)
        # Dict dispatch instead of an if/elif ladder per country.
        mapping = {
            "BR": self.br_mapping,
            "CL": self.cl_mapping,
            "MX": self.mx_mapping,
            "US": self.us_mapping,
            "AR": self.ar_mapping,
        }.get(country_code)
        if mapping and normalized in mapping:
            return mapping[normalized]
        # Fallback: "00" marks a national-level (unresolved) region.
        return "00"
class LatinAmericaGHCIDGenerator:
"""Generate GHCIDs for Latin American institutions."""
def __init__(self):
"""Initialize generator with dependencies."""
self.ghcid_gen = GHCIDGenerator()
self.region_mapper = RegionMapper()
self.geonames_db = GeoNamesDB()
# Statistics
self.stats = {
'total_institutions': 0,
'ghcids_generated': 0,
'missing_city_code': 0,
'missing_region_code': 0,
'collisions_detected': 0,
'errors': [],
}
# Collision detection
self.ghcid_usage: Dict[str, List[str]] = defaultdict(list) # GHCID -> [institution_names]
@staticmethod
def _get_city_code_fallback(city_name: str) -> str:
"""
Generate 3-letter city code from city name.
Handles city names with articles (La Serena, El Quisco, etc.)
by taking first letter of article + first 2 letters of main word.
Args:
city_name: City name (e.g., "La Serena", "São Paulo")
Returns:
3-letter uppercase code (e.g., "LSE", "SAO")
"""
import unicodedata
# Remove accents
normalized = unicodedata.normalize('NFD', city_name)
normalized = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
# Split into words
words = normalized.split()
if len(words) == 1:
# Single word: take first 3 letters
code = words[0][:3].upper()
elif words[0].lower() in ['la', 'el', 'los', 'las', 'o', 'a']:
# City with article: first letter of article + first 2 of next word
if len(words) > 1:
code = (words[0][0] + words[1][:2]).upper()
else:
code = words[0][:3].upper()
else:
# Multi-word: take first letter of each word (up to 3)
code = ''.join(w[0] for w in words[:3]).upper()
# Ensure exactly 3 letters
if len(code) < 3:
code = code.ljust(3, 'X')
elif len(code) > 3:
code = code[:3]
return code
def generate_for_institution(self, record: dict) -> Optional[GHCIDComponents]:
"""
Generate GHCID for a single institution record.
Args:
record: Institution record from YAML (dict)
Returns:
GHCIDComponents if successful, None otherwise
"""
self.stats['total_institutions'] += 1
try:
# Extract required fields
name = record.get('name')
institution_type_str = record.get('institution_type', 'UNKNOWN')
locations = record.get('locations', [])
if not name:
self.stats['errors'].append(f"Missing name for record: {record.get('id')}")
return None
if not locations:
self.stats['errors'].append(f"No locations for: {name}")
return None
location = locations[0] # Use first location
country_code = location.get('country')
region_name = location.get('region')
city_name = location.get('city')
if not country_code:
self.stats['errors'].append(f"No country for: {name}")
return None
# Get region code (ISO 3166-2)
region_code = "00" # Default to national-level
if region_name:
region_code = self.region_mapper.get_region_code(region_name, country_code)
if region_code == "00":
self.stats['missing_region_code'] += 1
# Get city code from GeoNames
city_code = "XXX" # Default for unknown/region-level
if city_name:
city_info = self.geonames_db.lookup_city(city_name, country_code)
if city_info:
city_code = city_info.get_abbreviation()
else:
self.stats['missing_city_code'] += 1
# Fallback: use first 3 letters of city name
# Handle city names with articles (La Serena -> LSE, El Quisco -> ELQ)
city_code = self._get_city_code_fallback(city_name)
# Map institution type to GHCID type code
try:
inst_type = InstitutionType[institution_type_str]
except KeyError:
inst_type = InstitutionType.UNKNOWN
# Generate abbreviation from name
abbreviation = extract_abbreviation_from_name(name)
# Create GHCID components
components = GHCIDComponents(
country_code=country_code,
region_code=region_code,
city_locode=city_code,
institution_type=inst_type.value,
abbreviation=abbreviation,
)
# Validate
is_valid, error_msg = components.validate()
if not is_valid:
self.stats['errors'].append(f"Invalid GHCID for {name}: {error_msg}")
return None
# Check for collisions
ghcid_str = components.to_string()
self.ghcid_usage[ghcid_str].append(name)
if len(self.ghcid_usage[ghcid_str]) > 1:
self.stats['collisions_detected'] += 1
self.stats['ghcids_generated'] += 1
return components
except Exception as e:
self.stats['errors'].append(f"Error generating GHCID for {record.get('name', 'unknown')}: {e}")
return None
def process_all_institutions(self, input_file: Path) -> List[dict]:
"""
Process all institutions in YAML file and generate GHCIDs.
Args:
input_file: Path to authoritative YAML file
Returns:
List of updated institution records with GHCID fields
"""
print(f"Loading institutions from: {input_file}")
with open(input_file, 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f)
print(f"Found {len(institutions)} institutions")
print()
updated_institutions = []
for i, record in enumerate(institutions, 1):
if i % 50 == 0:
print(f"Processing institution {i}/{len(institutions)}...")
# Generate GHCID
ghcid_components = self.generate_for_institution(record)
if ghcid_components:
# Add GHCID fields to record
record['ghcid'] = ghcid_components.to_string()
record['ghcid_numeric'] = ghcid_components.to_numeric()
# Use numeric GHCID as the main ID
old_id = record.get('id', '')
record['id'] = ghcid_components.to_numeric()
# Add GHCID to identifiers list
identifiers = record.get('identifiers', [])
# Add old ID to identifiers if it exists and isn't already there
if old_id:
has_old_id = any(i.get('identifier_value') == old_id for i in identifiers)
if not has_old_id:
identifiers.append({
'identifier_scheme': 'OLD_ID',
'identifier_value': old_id,
})
# Add GHCID identifier (if not already present)
has_ghcid = any(i.get('identifier_scheme') == 'GHCID' for i in identifiers)
if not has_ghcid:
identifiers.append({
'identifier_scheme': 'GHCID',
'identifier_value': ghcid_components.to_string(),
})
# Add numeric GHCID to identifiers as well
has_numeric = any(i.get('identifier_scheme') == 'GHCID_NUMERIC' for i in identifiers)
if not has_numeric:
identifiers.append({
'identifier_scheme': 'GHCID_NUMERIC',
'identifier_value': str(ghcid_components.to_numeric()),
})
record['identifiers'] = identifiers
updated_institutions.append(record)
return updated_institutions
def print_statistics(self):
"""Print generation statistics."""
print()
print("=" * 70)
print("GHCID GENERATION STATISTICS")
print("=" * 70)
print(f"Total institutions processed: {self.stats['total_institutions']}")
print(f"GHCIDs successfully generated: {self.stats['ghcids_generated']}")
print(f"Missing city codes (used fallback): {self.stats['missing_city_code']}")
print(f"Missing region codes (used '00'): {self.stats['missing_region_code']}")
print(f"GHCID collisions detected: {self.stats['collisions_detected']}")
print()
if self.stats['errors']:
print(f"⚠️ Errors encountered: {len(self.stats['errors'])}")
print()
print("Error details:")
for error in self.stats['errors'][:10]: # Show first 10
print(f" - {error}")
if len(self.stats['errors']) > 10:
print(f" ... and {len(self.stats['errors']) - 10} more")
else:
print("✅ No errors!")
print()
# Show collisions
if self.stats['collisions_detected'] > 0:
print("⚠️ GHCID COLLISIONS DETECTED:")
print()
for ghcid, names in self.ghcid_usage.items():
if len(names) > 1:
print(f" {ghcid}:")
for name in names:
print(f" - {name}")
print()
print("Note: Collisions will need Wikidata Q-numbers for disambiguation")
else:
print("✅ No GHCID collisions detected!")
print()
def validate_ghcids(self, institutions: List[dict]):
"""
Validate all generated GHCIDs.
Args:
institutions: List of institution records
"""
print("=" * 70)
print("VALIDATION")
print("=" * 70)
ghcid_set = set()
numeric_set = set()
duplicates = []
for record in institutions:
ghcid = record.get('ghcid')
ghcid_numeric = record.get('ghcid_numeric')
if ghcid:
if ghcid in ghcid_set:
duplicates.append(ghcid)
ghcid_set.add(ghcid)
if ghcid_numeric:
numeric_set.add(ghcid_numeric)
print(f"Unique GHCIDs: {len(ghcid_set)}")
print(f"Unique numeric GHCIDs: {len(numeric_set)}")
if duplicates:
print(f"⚠️ Duplicate GHCIDs found: {len(duplicates)}")
for dup in duplicates[:5]:
print(f" - {dup}")
else:
print("✅ All GHCIDs are unique!")
print()
def main():
    """Main entry point: back up, generate GHCIDs, validate, rewrite YAML.

    The authoritative YAML file is updated in place; a timestamped
    backup is written to the archive directory first.
    """
    import shutil

    # Paths — one variable so input and output cannot drift apart.
    project_root = Path(__file__).parent.parent
    data_file = project_root / "data" / "instances" / "latin_american_institutions_AUTHORITATIVE.yaml"
    input_file = data_file
    output_file = data_file
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_file = (project_root / "data" / "instances" / "archive"
                   / f"latin_american_institutions_pre_ghcid_{timestamp}.yaml")

    # Create backup before any modification
    print(f"Creating backup: {backup_file}")
    backup_file.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(input_file, backup_file)

    # Generate GHCIDs
    generator = LatinAmericaGHCIDGenerator()
    updated_institutions = generator.process_all_institutions(input_file)

    # Print statistics and validate uniqueness
    generator.print_statistics()
    generator.validate_ghcids(updated_institutions)

    # Write updated YAML with a stats header (header lines at column 0
    # so they remain valid YAML comments).
    print("=" * 70)
    print(f"Writing updated YAML to: {output_file}")
    header = f"""---
# Latin American GLAM Institutions - GHCID Enhanced
# Last updated: {datetime.now(timezone.utc).isoformat()}
# GHCID generation: {generator.stats['ghcids_generated']}/{generator.stats['total_institutions']} institutions
#
# GHCID Statistics:
# - Total institutions: {generator.stats['total_institutions']}
# - GHCIDs generated: {generator.stats['ghcids_generated']}
# - Missing city codes: {generator.stats['missing_city_code']}
# - Missing region codes: {generator.stats['missing_region_code']}
# - Collisions detected: {generator.stats['collisions_detected']}
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(updated_institutions, f,
                  default_flow_style=False,
                  allow_unicode=True,
                  sort_keys=False,
                  width=100)
    print(f"✅ Done! Updated {len(updated_institutions)} institutions")
    print()
# Run only when executed as a script (not on import).
if __name__ == "__main__":
    main()